Skip to main content

icu_properties/
script.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Data and APIs for supporting `Script_Extensions` property
6//! values in an efficient structure.
7
8use crate::props::Script;
9use crate::provider::*;
10
11#[cfg(feature = "alloc")]
12use core::iter::FromIterator;
13use core::ops::RangeInclusive;
14#[cfg(feature = "alloc")]
15use icu_collections::codepointinvlist::CodePointInversionList;
16use icu_provider::prelude::*;
17use zerovec::{ule::AsULE, ZeroSlice};
18
19#[cfg(feature = "harfbuzz_traits")]
20pub use crate::harfbuzz::{HarfbuzzScriptData, HarfbuzzScriptDataBorrowed};
21
/// Width, in bits, of the low-order field of a `ScriptWithExt` value. That
/// field holds either the `Script` value itself or an index into the
/// `extensions` structure.
const SCRIPT_VAL_LENGTH: u16 = 10;

/// Mask that isolates the low `SCRIPT_VAL_LENGTH` bits of a `ScriptWithExt`
/// value, i.e. the `Script` value (or `extensions` index).
const SCRIPT_X_SCRIPT_VAL: u16 = !(u16::MAX << SCRIPT_VAL_LENGTH);
29
30/// An internal-use only pseudo-property that represents the values stored in
31/// the trie of the special data structure [`ScriptWithExtensionsProperty`].
32///
33/// Note: This assumes a 12-bit layout. The 2 higher order bits in positions
34/// 11..10 will indicate how to deduce the Script value and Script_Extensions,
35/// and the lower 10 bits 9..0 indicate either the Script value or the index
36/// into the `extensions` structure.
// The derive list is written in its source form; the compiler generates the
// `Copy`, `Clone`, `Debug`, `Eq` and `PartialEq` impls from it.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
#[repr(transparent)]
#[doc(hidden)]
// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
#[allow(clippy::exhaustive_structs)] // this type is stable
pub struct ScriptWithExt(pub u16);
46
47#[allow(missing_docs)] // These constants don't need individual documentation.
48#[allow(non_upper_case_globals)]
49#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
50impl ScriptWithExt {
51    pub const Unknown: ScriptWithExt = ScriptWithExt(0);
52}
53
54impl AsULE for ScriptWithExt {
55    type ULE = <u16 as AsULE>::ULE;
56
57    #[inline]
58    fn to_unaligned(self) -> Self::ULE {
59        Script(self.0).to_unaligned()
60    }
61
62    #[inline]
63    fn from_unaligned(unaligned: Self::ULE) -> Self {
64        ScriptWithExt(Script::from_unaligned(unaligned).0)
65    }
66}
67
68#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
69impl ScriptWithExt {
70    /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
71    /// also indicates a Script value of [`Script::Common`].
72    ///
73    /// # Examples
74    ///
75    /// ```
76    /// use icu::properties::script::ScriptWithExt;
77    ///
78    /// assert!(ScriptWithExt(0x04FF).is_common());
79    /// assert!(ScriptWithExt(0x0400).is_common());
80    ///
81    /// assert!(!ScriptWithExt(0x08FF).is_common());
82    /// assert!(!ScriptWithExt(0x0800).is_common());
83    ///
84    /// assert!(!ScriptWithExt(0x0CFF).is_common());
85    /// assert!(!ScriptWithExt(0x0C00).is_common());
86    ///
87    /// assert!(!ScriptWithExt(0xFF).is_common());
88    /// assert!(!ScriptWithExt(0x0).is_common());
89    /// ```
90    pub fn is_common(&self) -> bool {
91        self.0 >> SCRIPT_VAL_LENGTH == 1
92    }
93
94    /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
95    /// also indicates a Script value of [`Script::Inherited`].
96    ///
97    /// # Examples
98    ///
99    /// ```
100    /// use icu::properties::script::ScriptWithExt;
101    ///
102    /// assert!(!ScriptWithExt(0x04FF).is_inherited());
103    /// assert!(!ScriptWithExt(0x0400).is_inherited());
104    ///
105    /// assert!(ScriptWithExt(0x08FF).is_inherited());
106    /// assert!(ScriptWithExt(0x0800).is_inherited());
107    ///
108    /// assert!(!ScriptWithExt(0x0CFF).is_inherited());
109    /// assert!(!ScriptWithExt(0x0C00).is_inherited());
110    ///
111    /// assert!(!ScriptWithExt(0xFF).is_inherited());
112    /// assert!(!ScriptWithExt(0x0).is_inherited());
113    /// ```
114    pub fn is_inherited(&self) -> bool {
115        self.0 >> SCRIPT_VAL_LENGTH == 2
116    }
117
118    /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
119    /// also indicates that the Script value is neither [`Script::Common`] nor
120    /// [`Script::Inherited`].
121    ///
122    /// # Examples
123    ///
124    /// ```
125    /// use icu::properties::script::ScriptWithExt;
126    ///
127    /// assert!(!ScriptWithExt(0x04FF).is_other());
128    /// assert!(!ScriptWithExt(0x0400).is_other());
129    ///
130    /// assert!(!ScriptWithExt(0x08FF).is_other());
131    /// assert!(!ScriptWithExt(0x0800).is_other());
132    ///
133    /// assert!(ScriptWithExt(0x0CFF).is_other());
134    /// assert!(ScriptWithExt(0x0C00).is_other());
135    ///
136    /// assert!(!ScriptWithExt(0xFF).is_other());
137    /// assert!(!ScriptWithExt(0x0).is_other());
138    /// ```
139    pub fn is_other(&self) -> bool {
140        self.0 >> SCRIPT_VAL_LENGTH == 3
141    }
142
143    /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions`.
144    ///
145    /// # Examples
146    ///
147    /// ```
148    /// use icu::properties::script::ScriptWithExt;
149    ///
150    /// assert!(ScriptWithExt(0x04FF).has_extensions());
151    /// assert!(ScriptWithExt(0x0400).has_extensions());
152    ///
153    /// assert!(ScriptWithExt(0x08FF).has_extensions());
154    /// assert!(ScriptWithExt(0x0800).has_extensions());
155    ///
156    /// assert!(ScriptWithExt(0x0CFF).has_extensions());
157    /// assert!(ScriptWithExt(0x0C00).has_extensions());
158    ///
159    /// assert!(!ScriptWithExt(0xFF).has_extensions());
160    /// assert!(!ScriptWithExt(0x0).has_extensions());
161    /// ```
162    pub fn has_extensions(&self) -> bool {
163        let high_order_bits = self.0 >> SCRIPT_VAL_LENGTH;
164        high_order_bits > 0
165    }
166}
167
168impl From<ScriptWithExt> for u32 {
169    fn from(swe: ScriptWithExt) -> Self {
170        swe.0 as u32
171    }
172}
173
174impl From<ScriptWithExt> for Script {
175    fn from(swe: ScriptWithExt) -> Self {
176        Script(swe.0)
177    }
178}
179
180/// A struct that wraps a [`Script`] array, such as in the return value for
181/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
182#[derive(#[automatically_derived]
impl<'a> ::core::marker::Copy for ScriptExtensionsSet<'a> { }Copy, #[automatically_derived]
impl<'a> ::core::clone::Clone for ScriptExtensionsSet<'a> {
    #[inline]
    fn clone(&self) -> ScriptExtensionsSet<'a> {
        let _: ::core::clone::AssertParamIsClone<&'a ZeroSlice<Script>>;
        *self
    }
}Clone, #[automatically_derived]
impl<'a> ::core::fmt::Debug for ScriptExtensionsSet<'a> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "ScriptExtensionsSet", "values", &&self.values)
    }
}Debug, #[automatically_derived]
impl<'a> ::core::cmp::Eq for ScriptExtensionsSet<'a> {
    #[inline]
    #[doc(hidden)]
    #[coverage(off)]
    fn assert_fields_are_eq(&self) {
        let _: ::core::cmp::AssertParamIsEq<&'a ZeroSlice<Script>>;
    }
}Eq, #[automatically_derived]
impl<'a> ::core::cmp::PartialEq for ScriptExtensionsSet<'a> {
    #[inline]
    fn eq(&self, other: &ScriptExtensionsSet<'a>) -> bool {
        self.values == other.values
    }
}PartialEq)]
183pub struct ScriptExtensionsSet<'a> {
184    values: &'a ZeroSlice<Script>,
185}
186
187impl<'a> ScriptExtensionsSet<'a> {
188    /// Returns whether this set contains the given script.
189    ///
190    /// # Example
191    ///
192    /// ```
193    /// use icu::properties::props::Script;
194    /// use icu::properties::script::ScriptWithExtensions;
195    /// let swe = ScriptWithExtensions::new();
196    ///
197    /// assert!(swe
198    ///     .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA
199    ///     .contains(&Script::Grantha));
200    /// ```
201    pub fn contains(&self, x: &Script) -> bool {
202        ZeroSlice::binary_search(self.values, x).is_ok()
203    }
204
205    /// Gets an iterator over the elements.
206    ///
207    /// # Example
208    ///
209    /// ```
210    /// use icu::properties::props::Script;
211    /// use icu::properties::script::ScriptWithExtensions;
212    /// let swe = ScriptWithExtensions::new();
213    ///
214    /// assert_eq!(
215    ///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
216    ///         .iter()
217    ///         .collect::<Vec<_>>(),
218    ///     [Script::Tamil, Script::Grantha]
219    /// );
220    /// ```
221    pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a {
222        ZeroSlice::iter(self.values)
223    }
224
225    /// For accessing this set as an array instead of an iterator
226    #[doc(hidden)] // used by FFI code
227    pub fn array_len(&self) -> usize {
228        self.values.len()
229    }
230    /// For accessing this set as an array instead of an iterator
231    #[doc(hidden)] // used by FFI code
232    pub fn array_get(&self, index: usize) -> Option<Script> {
233        self.values.get(index)
234    }
235}
236
237/// A struct that represents the data for the Script and `Script_Extensions` properties.
238///
239/// ✨ *Enabled with the `compiled_data` Cargo feature.*
240///
241/// [📚 Help choosing a constructor](icu_provider::constructors)
242///
243/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`]
244///
245/// # Examples
246///
247/// ```
248/// use icu::properties::script::ScriptWithExtensions;
249/// use icu::properties::props::Script;
250/// let swe = ScriptWithExtensions::new();
251///
252/// // get the `Script` property value
253/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL
254/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA
255/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // U+0660 ARABIC-INDIC DIGIT ZERO
256/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
257///
258/// // get the `Script_Extensions` property value
259/// assert_eq!(
260///     swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL
261///         .iter().collect::<Vec<_>>(),
262///     [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
263///          Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
264///          Script::OldUyghur]
265/// );
266/// assert_eq!(
267///     swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
268///         .iter().collect::<Vec<_>>(),
269///     [Script::Common]
270/// );
271/// assert_eq!(
272///     swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
273///         .iter().collect::<Vec<_>>(),
274///     [Script::Inherited]
275/// );
276/// assert_eq!(
277///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
278///         .iter().collect::<Vec<_>>(),
279///     [Script::Tamil, Script::Grantha]
280/// );
281///
282/// // check containment of a `Script` value in the `Script_Extensions` value
283/// // U+0650 ARABIC KASRA
284/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
285/// assert!(swe.has_script('\u{0650}', Script::Arabic));
286/// assert!(swe.has_script('\u{0650}', Script::Syriac));
287/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
288///
289/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
290/// let syriac = swe.get_script_extensions_set(Script::Syriac);
291/// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA
292/// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO
293/// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM
294/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
295/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
296/// ```
297#[derive(#[automatically_derived]
impl ::core::fmt::Debug for ScriptWithExtensions {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "ScriptWithExtensions", "data", &&self.data)
    }
}Debug)]
298pub struct ScriptWithExtensions {
299    data: DataPayload<PropertyScriptWithExtensionsV1>,
300}
301
302/// A borrowed wrapper around script extension data, returned by
303/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
304#[derive(#[automatically_derived]
impl<'a> ::core::clone::Clone for ScriptWithExtensionsBorrowed<'a> {
    #[inline]
    fn clone(&self) -> ScriptWithExtensionsBorrowed<'a> {
        let _:
                ::core::clone::AssertParamIsClone<&'a ScriptWithExtensionsProperty<'a>>;
        *self
    }
}Clone, #[automatically_derived]
impl<'a> ::core::marker::Copy for ScriptWithExtensionsBorrowed<'a> { }Copy, #[automatically_derived]
impl<'a> ::core::fmt::Debug for ScriptWithExtensionsBorrowed<'a> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field1_finish(f,
            "ScriptWithExtensionsBorrowed", "data", &&self.data)
    }
}Debug)]
305pub struct ScriptWithExtensionsBorrowed<'a> {
306    data: &'a ScriptWithExtensionsProperty<'a>,
307}
308
309impl ScriptWithExtensions {
310    /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
311    ///
312    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
313    ///
314    /// [📚 Help choosing a constructor](icu_provider::constructors)
315    #[cfg(feature = "compiled_data")]
316    #[expect(clippy::new_ret_no_self)]
317    pub fn new() -> ScriptWithExtensionsBorrowed<'static> {
318        ScriptWithExtensionsBorrowed::new()
319    }
320
321    icu_provider::gen_buffer_data_constructors!(
322        () -> result: Result<ScriptWithExtensions, DataError>,
323        functions: [
324            new: skip,
325            try_new_with_buffer_provider,
326            try_new_unstable,
327            Self,
328        ]
329    );
330
331    #[doc = "A version of [`Self::new`] that uses custom data provided by a [`DataProvider`].\n\n[\u{1f4da} Help choosing a constructor](icu_provider::constructors)\n\n<div class=\"stab unstable\">\u{26a0}\u{fe0f} The bounds on <tt>provider</tt> may change over time, including in SemVer minor releases.</div>"icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
332    pub fn try_new_unstable(
333        provider: &(impl DataProvider<PropertyScriptWithExtensionsV1> + ?Sized),
334    ) -> Result<Self, DataError> {
335        Ok(ScriptWithExtensions::from_data(
336            provider.load(Default::default())?.payload,
337        ))
338    }
339
340    /// Construct a borrowed version of this type that can be queried.
341    ///
342    /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
343    /// up front.
344    #[inline]
345    pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
346        ScriptWithExtensionsBorrowed {
347            data: self.data.get(),
348        }
349    }
350
351    /// Construct a new one from loaded data
352    ///
353    /// Typically it is preferable to use constructors like [`Self::try_new_unstable()`] instead
354    pub(crate) fn from_data(data: DataPayload<PropertyScriptWithExtensionsV1>) -> Self {
355        Self { data }
356    }
357}
358
359impl<'a> ScriptWithExtensionsBorrowed<'a> {
360    /// Returns the `Script` property value for this code point.
361    ///
362    /// # Examples
363    ///
364    /// ```
365    /// use icu::properties::script::ScriptWithExtensions;
366    /// use icu::properties::props::Script;
367    ///
368    /// let swe = ScriptWithExtensions::new();
369    ///
370    /// // U+0640 ARABIC TATWEEL
371    /// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value
372    /// assert_ne!(swe.get_script_val('ـ'), Script::Arabic);
373    /// assert_ne!(swe.get_script_val('ـ'), Script::Syriac);
374    /// assert_ne!(swe.get_script_val('ـ'), Script::Thaana);
375    ///
376    /// // U+0650 ARABIC KASRA
377    /// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value
378    /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic);
379    /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac);
380    /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana);
381    ///
382    /// // U+0660 ARABIC-INDIC DIGIT ZERO
383    /// assert_ne!(swe.get_script_val('٠'), Script::Common);
384    /// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value
385    /// assert_ne!(swe.get_script_val('٠'), Script::Syriac);
386    /// assert_ne!(swe.get_script_val('٠'), Script::Thaana);
387    ///
388    /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
389    /// assert_ne!(swe.get_script_val('ﷲ'), Script::Common);
390    /// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value
391    /// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac);
392    /// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana);
393    /// ```
394    pub fn get_script_val(self, ch: char) -> Script {
395        self.get_script_val32(ch as u32)
396    }
397
398    /// See [`Self::get_script_val`].
399    pub fn get_script_val32(self, code_point: u32) -> Script {
400        let sc_with_ext = self.data.trie.get32(code_point);
401
402        if sc_with_ext.is_other() {
403            let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
404            let scx_val = self.data.extensions.get(ext_idx as usize);
405            let scx_first_sc = scx_val.and_then(|scx| scx.get(0));
406
407            let default_sc_val = Script::Unknown;
408
409            scx_first_sc.unwrap_or(default_sc_val)
410        } else if sc_with_ext.is_common() {
411            Script::Common
412        } else if sc_with_ext.is_inherited() {
413            Script::Inherited
414        } else {
415            let script_val = sc_with_ext.0;
416            Script(script_val)
417        }
418    }
419    // Returns the Script_Extensions value for a code_point when the trie value
420    // is already known.
421    // This private helper method exists to prevent code duplication in callers like
422    // `get_script_extensions_val`, `get_script_extensions_set`, and `has_script`.
423    fn get_scx_val_using_trie_val(
424        self,
425        sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
426    ) -> &'a ZeroSlice<Script> {
427        let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
428        if sc_with_ext.is_other() {
429            let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
430            let ext_subarray = self.data.extensions.get(ext_idx as usize);
431            // In the OTHER case, where the 2 higher-order bits of the
432            // `ScriptWithExt` value in the trie doesn't indicate the Script value,
433            // the Script value is copied/inserted into the first position of the
434            // `extensions` array. So we must remove it to return the actual scx array val.
435            let scx_slice = ext_subarray
436                .and_then(|zslice| zslice.as_ule_slice().get(1..))
437                .unwrap_or_default();
438            ZeroSlice::from_ule_slice(scx_slice)
439        } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() {
440            let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
441            let scx_val = self.data.extensions.get(ext_idx as usize);
442            scx_val.unwrap_or_default()
443        } else {
444            // Note: `Script` and `ScriptWithExt` are both represented as the same
445            // u16 value when the `ScriptWithExt` has no higher-order bits set.
446            let script_ule_slice = core::slice::from_ref(sc_with_ext_ule);
447            ZeroSlice::from_ule_slice(script_ule_slice)
448        }
449    }
450    /// Return the `Script_Extensions` property value for this code point.
451    ///
452    /// If `ch` has `Script_Extensions`, then return the Script codes in
453    /// the `Script_Extensions`. In this case, the [`Script`] property value
454    /// (normally `Common` or `Inherited`) is not included in the [`ScriptExtensionsSet`].
455    ///
456    /// If `ch` does not have `Script_Extensions`, then the returned
457    /// [`ScriptExtensionsSet`] contains just the one [`Script`] code.
458    ///
459    /// If the code point is not valid, then return an empty [`ScriptExtensionsSet`].
460    ///
461    /// # Examples
462    ///
463    /// ```
464    /// use icu::properties::script::ScriptWithExtensions;
465    /// use icu::properties::props::Script;
466    ///
467    /// let swe = ScriptWithExtensions::new();
468    ///
469    /// assert_eq!(
470    ///     swe.get_script_extensions_val('𐓐') // U+104D0 OSAGE CAPITAL LETTER KHA
471    ///         .iter()
472    ///         .collect::<Vec<_>>(),
473    ///     [Script::Osage]
474    /// );
475    /// assert_eq!(
476    ///     swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
477    ///         .iter()
478    ///         .collect::<Vec<_>>(),
479    ///     [Script::Common]
480    /// );
481    /// assert_eq!(
482    ///     swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
483    ///         .iter()
484    ///         .collect::<Vec<_>>(),
485    ///     [Script::Inherited]
486    /// );
487    /// assert_eq!(
488    ///     swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
489    ///         .iter()
490    ///         .collect::<Vec<_>>(),
491    ///     [Script::Tamil, Script::Grantha]
492    /// );
493    /// ```
494    pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> {
495        self.get_script_extensions_val32(ch as u32)
496    }
497
498    /// See [`Self::get_script_extensions_val`].
499    pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> {
500        let sc_with_ext_ule = self.data.trie.get32_ule(code_point);
501
502        ScriptExtensionsSet {
503            values: match sc_with_ext_ule {
504                Some(ule_ref) => self.get_scx_val_using_trie_val(ule_ref),
505                None => ZeroSlice::from_ule_slice(&[]),
506            },
507        }
508    }
509
510    /// Returns whether `script` is contained in the `Script_Extensions`
511    /// property value if the `code_point` has `Script_Extensions`, otherwise
512    /// if the code point does not have `Script_Extensions` then returns
513    /// whether the Script property value matches.
514    ///
515    /// Some characters are commonly used in multiple scripts. For more information,
516    /// see UAX #24: <https://www.unicode.org/reports/tr24/>.
517    ///
518    /// # Examples
519    ///
520    /// ```
521    /// use icu::properties::script::ScriptWithExtensions;
522    /// use icu::properties::props::Script;
523    ///
524    /// let swe = ScriptWithExtensions::new();
525    ///
526    /// // U+0650 ARABIC KASRA
527    /// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
528    /// assert!(swe.has_script('\u{0650}', Script::Arabic));
529    /// assert!(swe.has_script('\u{0650}', Script::Syriac));
530    /// assert!(!swe.has_script('\u{0650}', Script::Thaana));
531    ///
532    /// // U+0660 ARABIC-INDIC DIGIT ZERO
533    /// assert!(!swe.has_script('٠', Script::Common));
534    /// assert!(swe.has_script('٠', Script::Arabic)); // main Script value
535    /// assert!(!swe.has_script('٠', Script::Syriac));
536    /// assert!(swe.has_script('٠', Script::Thaana));
537    ///
538    /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
539    /// assert!(!swe.has_script('ﷲ', Script::Common));
540    /// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value
541    /// assert!(!swe.has_script('ﷲ', Script::Syriac));
542    /// assert!(swe.has_script('ﷲ', Script::Thaana));
543    /// ```
544    pub fn has_script(self, ch: char, script: Script) -> bool {
545        self.has_script32(ch as u32, script)
546    }
547
548    /// See [`Self::has_script`].
549    pub fn has_script32(self, code_point: u32, script: Script) -> bool {
550        let sc_with_ext_ule = if let Some(scwe_ule) = self.data.trie.get32_ule(code_point) {
551            scwe_ule
552        } else {
553            return false;
554        };
555        let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
556
557        if !sc_with_ext.has_extensions() {
558            let script_val = sc_with_ext.0;
559            script == Script(script_val)
560        } else {
561            let scx_val = self.get_scx_val_using_trie_val(sc_with_ext_ule);
562            let script_find = scx_val.iter().find(|&sc| sc == script);
563            script_find.is_some()
564        }
565    }
566
567    /// Returns all of the matching code point ranges for the given [`Script`]
568    /// in which `has_script` will return true for all of the contained code points.
569    ///
570    /// # Examples
571    ///
572    /// ```
573    /// use icu::properties::props::Script;
574    /// use icu::properties::script::ScriptWithExtensions;
575    ///
576    /// let swe = ScriptWithExtensions::new();
577    ///
578    /// let syriac_script_extensions_ranges =
579    ///     swe.get_script_extensions_ranges(Script::Syriac);
580    ///
581    /// let exp_ranges = [
582    ///     0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON
583    ///     0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS
584    ///     0x030A..=0x030A, // COMBINING RING ABOVE
585    ///     0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW
586    ///     0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW
587    ///     0x0330..=0x0331, // COMBINING TILDE BELOW..COMBINING MACRON BELOW
588    ///     0x060C..=0x060C, // ARABIC COMMA
589    ///     0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
590    ///     0x061F..=0x061F, // ARABIC QUESTION MARK
591    ///     0x0640..=0x0640, // ARABIC TATWEEL
592    ///     0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
593    ///     0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
594    ///     0x0700..=0x070D, // Syriac block begins at U+0700
595    ///     0x070F..=0x074A, // Syriac block
596    ///     0x074D..=0x074F, // Syriac block ends at U+074F
597    ///     0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
598    ///     0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT
599    ///     0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT
600    /// ];
601    ///
602    /// assert_eq!(
603    ///     syriac_script_extensions_ranges.collect::<Vec<_>>(),
604    ///     exp_ranges
605    /// );
606    /// ```
607    pub fn get_script_extensions_ranges(
608        self,
609        script: Script,
610    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
611        self.data
612            .trie
613            .iter_ranges_mapped(move |value| {
614                let sc_with_ext = ScriptWithExt(value.0);
615                if sc_with_ext.has_extensions() {
616                    self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
617                        .iter()
618                        .any(|sc| sc == script)
619                } else {
620                    script == sc_with_ext.into()
621                }
622            })
623            .filter(|v| v.value)
624            .map(|v| v.range)
625    }
626
627    /// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
628    /// code points for which `has_script` will return true.
629    ///
630    /// ✨ *Enabled with the `alloc` Cargo feature.*
631    ///
632    /// # Examples
633    ///
634    /// ```
635    /// use icu::properties::script::ScriptWithExtensions;
636    /// use icu::properties::props::Script;
637    ///
638    /// let swe = ScriptWithExtensions::new();
639    ///
640    /// let syriac = swe.get_script_extensions_set(Script::Syriac);
641    ///
642    /// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK
643    /// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK
644    /// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH
645    ///
646    /// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
647    /// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
648    /// assert!(!syriac.contains('\u{074B}')); // unassigned
649    /// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE
650    /// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
651    ///
652    /// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT
653    /// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW
654    /// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT
655    /// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK
656    /// ```
657    #[cfg(feature = "alloc")]
658    pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
659        CodePointInversionList::from_iter(self.get_script_extensions_ranges(script))
660    }
661}
662
// The default instance is the compiled-data one.
#[cfg(feature = "compiled_data")]
impl Default for ScriptWithExtensionsBorrowed<'static> {
    fn default() -> Self {
        ScriptWithExtensionsBorrowed::new()
    }
}
669
670impl ScriptWithExtensionsBorrowed<'static> {
671    /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
672    ///
673    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
674    ///
675    /// [📚 Help choosing a constructor](icu_provider::constructors)
676    #[cfg(feature = "compiled_data")]
677    pub fn new() -> Self {
678        Self {
679            data: Baked::SINGLETON_PROPERTY_SCRIPT_WITH_EXTENSIONS_V1,
680        }
681    }
682
683    /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`].
684    ///
685    /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some
686    /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`].
687    pub const fn static_to_owned(self) -> ScriptWithExtensions {
688        ScriptWithExtensions {
689            data: DataPayload::from_static_ref(self.data),
690        }
691    }
692}
693
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for <https://github.com/unicode-org/icu4x/issues/6041>
    #[test]
    fn test_scx_regression_6041() {
        let expected = [
            Script::Bengali,
            Script::Cyrillic,
            Script::Devanagari,
            Script::Latin,
            Script::Thai,
            Script::Lisu,
            Script::Toto,
        ];

        let swe = ScriptWithExtensions::new();
        let scripts: Vec<Script> = swe
            .get_script_extensions_val('\u{2bc}')
            .iter()
            .collect();

        assert_eq!(scripts, expected);
    }
}