icu_properties/script.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Data and APIs for supporting `Script_Extensions` property
6//! values in an efficient structure.
7
8use crate::props::Script;
9use crate::provider::*;
10
11#[cfg(feature = "alloc")]
12use core::iter::FromIterator;
13use core::ops::RangeInclusive;
14#[cfg(feature = "alloc")]
15use icu_collections::codepointinvlist::CodePointInversionList;
16use icu_provider::prelude::*;
17use zerovec::{ule::AsULE, ZeroSlice};
18
19#[cfg(feature = "harfbuzz_traits")]
20pub use crate::harfbuzz::{HarfbuzzScriptData, HarfbuzzScriptDataBorrowed};
21
/// Width, in bits, of the low-order field of a `ScriptWithExt` value. That
/// field stores either the `Script` value or an index into `extensions`.
const SCRIPT_VAL_LENGTH: u16 = 10;

/// Mask that selects the low `SCRIPT_VAL_LENGTH` bits of a `ScriptWithExt`
/// value, i.e. the `Script` value (or `extensions` index).
const SCRIPT_X_SCRIPT_VAL: u16 = (1 << SCRIPT_VAL_LENGTH) - 1;
29
/// An internal-use only pseudo-property that represents the values stored in
/// the trie of the special data structure [`ScriptWithExtensionsProperty`].
///
/// Note: This assumes a 12-bit layout. The 2 higher-order bits, in positions
/// 11..10, indicate how to deduce the Script value and Script_Extensions; the
/// lower 10 bits, in positions 9..0, hold either the Script value or the index
/// into the `extensions` structure.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
#[repr(transparent)]
#[doc(hidden)]
// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
#[allow(clippy::exhaustive_structs)] // this type is stable
pub struct ScriptWithExt(pub u16);
46
47#[allow(missing_docs)] // These constants don't need individual documentation.
48#[allow(non_upper_case_globals)]
49#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
50impl ScriptWithExt {
51 pub const Unknown: ScriptWithExt = ScriptWithExt(0);
52}
53
54impl AsULE for ScriptWithExt {
55 type ULE = <u16 as AsULE>::ULE;
56
57 #[inline]
58 fn to_unaligned(self) -> Self::ULE {
59 Script(self.0).to_unaligned()
60 }
61
62 #[inline]
63 fn from_unaligned(unaligned: Self::ULE) -> Self {
64 ScriptWithExt(Script::from_unaligned(unaligned).0)
65 }
66}
67
68#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
69impl ScriptWithExt {
70 /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
71 /// also indicates a Script value of [`Script::Common`].
72 ///
73 /// # Examples
74 ///
75 /// ```
76 /// use icu::properties::script::ScriptWithExt;
77 ///
78 /// assert!(ScriptWithExt(0x04FF).is_common());
79 /// assert!(ScriptWithExt(0x0400).is_common());
80 ///
81 /// assert!(!ScriptWithExt(0x08FF).is_common());
82 /// assert!(!ScriptWithExt(0x0800).is_common());
83 ///
84 /// assert!(!ScriptWithExt(0x0CFF).is_common());
85 /// assert!(!ScriptWithExt(0x0C00).is_common());
86 ///
87 /// assert!(!ScriptWithExt(0xFF).is_common());
88 /// assert!(!ScriptWithExt(0x0).is_common());
89 /// ```
90 pub fn is_common(&self) -> bool {
91 self.0 >> SCRIPT_VAL_LENGTH == 1
92 }
93
94 /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
95 /// also indicates a Script value of [`Script::Inherited`].
96 ///
97 /// # Examples
98 ///
99 /// ```
100 /// use icu::properties::script::ScriptWithExt;
101 ///
102 /// assert!(!ScriptWithExt(0x04FF).is_inherited());
103 /// assert!(!ScriptWithExt(0x0400).is_inherited());
104 ///
105 /// assert!(ScriptWithExt(0x08FF).is_inherited());
106 /// assert!(ScriptWithExt(0x0800).is_inherited());
107 ///
108 /// assert!(!ScriptWithExt(0x0CFF).is_inherited());
109 /// assert!(!ScriptWithExt(0x0C00).is_inherited());
110 ///
111 /// assert!(!ScriptWithExt(0xFF).is_inherited());
112 /// assert!(!ScriptWithExt(0x0).is_inherited());
113 /// ```
114 pub fn is_inherited(&self) -> bool {
115 self.0 >> SCRIPT_VAL_LENGTH == 2
116 }
117
118 /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions` and
119 /// also indicates that the Script value is neither [`Script::Common`] nor
120 /// [`Script::Inherited`].
121 ///
122 /// # Examples
123 ///
124 /// ```
125 /// use icu::properties::script::ScriptWithExt;
126 ///
127 /// assert!(!ScriptWithExt(0x04FF).is_other());
128 /// assert!(!ScriptWithExt(0x0400).is_other());
129 ///
130 /// assert!(!ScriptWithExt(0x08FF).is_other());
131 /// assert!(!ScriptWithExt(0x0800).is_other());
132 ///
133 /// assert!(ScriptWithExt(0x0CFF).is_other());
134 /// assert!(ScriptWithExt(0x0C00).is_other());
135 ///
136 /// assert!(!ScriptWithExt(0xFF).is_other());
137 /// assert!(!ScriptWithExt(0x0).is_other());
138 /// ```
139 pub fn is_other(&self) -> bool {
140 self.0 >> SCRIPT_VAL_LENGTH == 3
141 }
142
143 /// Returns whether the [`ScriptWithExt`] value has `Script_Extensions`.
144 ///
145 /// # Examples
146 ///
147 /// ```
148 /// use icu::properties::script::ScriptWithExt;
149 ///
150 /// assert!(ScriptWithExt(0x04FF).has_extensions());
151 /// assert!(ScriptWithExt(0x0400).has_extensions());
152 ///
153 /// assert!(ScriptWithExt(0x08FF).has_extensions());
154 /// assert!(ScriptWithExt(0x0800).has_extensions());
155 ///
156 /// assert!(ScriptWithExt(0x0CFF).has_extensions());
157 /// assert!(ScriptWithExt(0x0C00).has_extensions());
158 ///
159 /// assert!(!ScriptWithExt(0xFF).has_extensions());
160 /// assert!(!ScriptWithExt(0x0).has_extensions());
161 /// ```
162 pub fn has_extensions(&self) -> bool {
163 let high_order_bits = self.0 >> SCRIPT_VAL_LENGTH;
164 high_order_bits > 0
165 }
166}
167
168impl From<ScriptWithExt> for u32 {
169 fn from(swe: ScriptWithExt) -> Self {
170 swe.0 as u32
171 }
172}
173
174impl From<ScriptWithExt> for Script {
175 fn from(swe: ScriptWithExt) -> Self {
176 Script(swe.0)
177 }
178}
179
180/// A struct that wraps a [`Script`] array, such as in the return value for
181/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
182#[derive(Copy, Clone, Debug, Eq, PartialEq)]
183pub struct ScriptExtensionsSet<'a> {
184 values: &'a ZeroSlice<Script>,
185}
186
187impl<'a> ScriptExtensionsSet<'a> {
188 /// Returns whether this set contains the given script.
189 ///
190 /// # Example
191 ///
192 /// ```
193 /// use icu::properties::props::Script;
194 /// use icu::properties::script::ScriptWithExtensions;
195 /// let swe = ScriptWithExtensions::new();
196 ///
197 /// assert!(swe
198 /// .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA
199 /// .contains(&Script::Grantha));
200 /// ```
201 pub fn contains(&self, x: &Script) -> bool {
202 ZeroSlice::binary_search(self.values, x).is_ok()
203 }
204
205 /// Gets an iterator over the elements.
206 ///
207 /// # Example
208 ///
209 /// ```
210 /// use icu::properties::props::Script;
211 /// use icu::properties::script::ScriptWithExtensions;
212 /// let swe = ScriptWithExtensions::new();
213 ///
214 /// assert_eq!(
215 /// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
216 /// .iter()
217 /// .collect::<Vec<_>>(),
218 /// [Script::Tamil, Script::Grantha]
219 /// );
220 /// ```
221 pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a {
222 ZeroSlice::iter(self.values)
223 }
224
225 /// For accessing this set as an array instead of an iterator
226 #[doc(hidden)] // used by FFI code
227 pub fn array_len(&self) -> usize {
228 self.values.len()
229 }
230 /// For accessing this set as an array instead of an iterator
231 #[doc(hidden)] // used by FFI code
232 pub fn array_get(&self, index: usize) -> Option<Script> {
233 self.values.get(index)
234 }
235}
236
237/// A struct that represents the data for the Script and `Script_Extensions` properties.
238///
239/// ✨ *Enabled with the `compiled_data` Cargo feature.*
240///
241/// [📚 Help choosing a constructor](icu_provider::constructors)
242///
243/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`]
244///
245/// # Examples
246///
247/// ```
248/// use icu::properties::script::ScriptWithExtensions;
249/// use icu::properties::props::Script;
250/// let swe = ScriptWithExtensions::new();
251///
252/// // get the `Script` property value
253/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL
254/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA
255/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // // U+0660 ARABIC-INDIC DIGIT ZERO
256/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
257///
258/// // get the `Script_Extensions` property value
259/// assert_eq!(
260/// swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL
261/// .iter().collect::<Vec<_>>(),
262/// [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
263/// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
264/// Script::OldUyghur]
265/// );
266/// assert_eq!(
267/// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
268/// .iter().collect::<Vec<_>>(),
269/// [Script::Common]
270/// );
271/// assert_eq!(
272/// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
273/// .iter().collect::<Vec<_>>(),
274/// [Script::Inherited]
275/// );
276/// assert_eq!(
277/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
278/// .iter().collect::<Vec<_>>(),
279/// [Script::Tamil, Script::Grantha]
280/// );
281///
282/// // check containment of a `Script` value in the `Script_Extensions` value
283/// // U+0650 ARABIC KASRA
284/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
285/// assert!(swe.has_script('\u{0650}', Script::Arabic));
286/// assert!(swe.has_script('\u{0650}', Script::Syriac));
287/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
288///
289/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
290/// let syriac = swe.get_script_extensions_set(Script::Syriac);
291/// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA
292/// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO
293/// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM
294/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
295/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
296/// ```
297#[derive(Debug)]
298pub struct ScriptWithExtensions {
299 data: DataPayload<PropertyScriptWithExtensionsV1>,
300}
301
302/// A borrowed wrapper around script extension data, returned by
303/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
304#[derive(Clone, Copy, Debug)]
305pub struct ScriptWithExtensionsBorrowed<'a> {
306 data: &'a ScriptWithExtensionsProperty<'a>,
307}
308
309impl ScriptWithExtensions {
310 /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
311 ///
312 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
313 ///
314 /// [📚 Help choosing a constructor](icu_provider::constructors)
315 #[cfg(feature = "compiled_data")]
316 #[expect(clippy::new_ret_no_self)]
317 pub fn new() -> ScriptWithExtensionsBorrowed<'static> {
318 ScriptWithExtensionsBorrowed::new()
319 }
320
321 icu_provider::gen_buffer_data_constructors!(
322 () -> result: Result<ScriptWithExtensions, DataError>,
323 functions: [
324 new: skip,
325 try_new_with_buffer_provider,
326 try_new_unstable,
327 Self,
328 ]
329 );
330
331 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
332 pub fn try_new_unstable(
333 provider: &(impl DataProvider<PropertyScriptWithExtensionsV1> + ?Sized),
334 ) -> Result<Self, DataError> {
335 Ok(ScriptWithExtensions::from_data(
336 provider.load(Default::default())?.payload,
337 ))
338 }
339
340 /// Construct a borrowed version of this type that can be queried.
341 ///
342 /// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
343 /// up front.
344 #[inline]
345 pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
346 ScriptWithExtensionsBorrowed {
347 data: self.data.get(),
348 }
349 }
350
351 /// Construct a new one from loaded data
352 ///
353 /// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead
354 pub(crate) fn from_data(data: DataPayload<PropertyScriptWithExtensionsV1>) -> Self {
355 Self { data }
356 }
357}
358
359impl<'a> ScriptWithExtensionsBorrowed<'a> {
360 /// Returns the `Script` property value for this code point.
361 ///
362 /// # Examples
363 ///
364 /// ```
365 /// use icu::properties::script::ScriptWithExtensions;
366 /// use icu::properties::props::Script;
367 ///
368 /// let swe = ScriptWithExtensions::new();
369 ///
370 /// // U+0640 ARABIC TATWEEL
371 /// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value
372 /// assert_ne!(swe.get_script_val('ـ'), Script::Arabic);
373 /// assert_ne!(swe.get_script_val('ـ'), Script::Syriac);
374 /// assert_ne!(swe.get_script_val('ـ'), Script::Thaana);
375 ///
376 /// // U+0650 ARABIC KASRA
377 /// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value
378 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic);
379 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac);
380 /// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana);
381 ///
382 /// // U+0660 ARABIC-INDIC DIGIT ZERO
383 /// assert_ne!(swe.get_script_val('٠'), Script::Common);
384 /// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value
385 /// assert_ne!(swe.get_script_val('٠'), Script::Syriac);
386 /// assert_ne!(swe.get_script_val('٠'), Script::Thaana);
387 ///
388 /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
389 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Common);
390 /// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value
391 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac);
392 /// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana);
393 /// ```
394 pub fn get_script_val(self, ch: char) -> Script {
395 self.get_script_val32(ch as u32)
396 }
397
398 /// See [`Self::get_script_val`].
399 pub fn get_script_val32(self, code_point: u32) -> Script {
400 let sc_with_ext = self.data.trie.get32(code_point);
401
402 if sc_with_ext.is_other() {
403 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
404 let scx_val = self.data.extensions.get(ext_idx as usize);
405 let scx_first_sc = scx_val.and_then(|scx| scx.get(0));
406
407 let default_sc_val = Script::Unknown;
408
409 scx_first_sc.unwrap_or(default_sc_val)
410 } else if sc_with_ext.is_common() {
411 Script::Common
412 } else if sc_with_ext.is_inherited() {
413 Script::Inherited
414 } else {
415 let script_val = sc_with_ext.0;
416 Script(script_val)
417 }
418 }
419 // Returns the Script_Extensions value for a code_point when the trie value
420 // is already known.
421 // This private helper method exists to prevent code duplication in callers like
422 // `get_script_extensions_val`, `get_script_extensions_set`, and `has_script`.
423 fn get_scx_val_using_trie_val(
424 self,
425 sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
426 ) -> &'a ZeroSlice<Script> {
427 let sc_with_ext = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
428 if sc_with_ext.is_other() {
429 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
430 let ext_subarray = self.data.extensions.get(ext_idx as usize);
431 // In the OTHER case, where the 2 higher-order bits of the
432 // `ScriptWithExt` value in the trie doesn't indicate the Script value,
433 // the Script value is copied/inserted into the first position of the
434 // `extensions` array. So we must remove it to return the actual scx array val.
435 let scx_slice = ext_subarray
436 .and_then(|zslice| zslice.as_ule_slice().get(1..))
437 .unwrap_or_default();
438 ZeroSlice::from_ule_slice(scx_slice)
439 } else if sc_with_ext.is_common() || sc_with_ext.is_inherited() {
440 let ext_idx = sc_with_ext.0 & SCRIPT_X_SCRIPT_VAL;
441 let scx_val = self.data.extensions.get(ext_idx as usize);
442 scx_val.unwrap_or_default()
443 } else {
444 // Note: `Script` and `ScriptWithExt` are both represented as the same
445 // u16 value when the `ScriptWithExt` has no higher-order bits set.
446 let script_ule_slice = core::slice::from_ref(sc_with_ext_ule);
447 ZeroSlice::from_ule_slice(script_ule_slice)
448 }
449 }
450 /// Return the `Script_Extensions` property value for this code point.
451 ///
452 /// If `code_point` has `Script_Extensions`, then return the Script codes in
453 /// the `Script_Extensions`. In this case, the [`Script`] property value
454 /// (normally `Common` or `Inherited`) is not included in the [`ScriptExtensionsSet`].
455 ///
456 /// If `c` does not have `Script_Extensions`, then the one [`Script`] code is put
457 /// into the [`ScriptExtensionsSet`] and also returned.
458 ///
459 /// If `c` is not a valid code point, then return an empty [`ScriptExtensionsSet`].
460 ///
461 /// # Examples
462 ///
463 /// ```
464 /// use icu::properties::script::ScriptWithExtensions;
465 /// use icu::properties::props::Script;
466 ///
467 /// let swe = ScriptWithExtensions::new();
468 ///
469 /// assert_eq!(
470 /// swe.get_script_extensions_val('𐓐') // U+104D0 OSAGE CAPITAL LETTER KHA
471 /// .iter()
472 /// .collect::<Vec<_>>(),
473 /// [Script::Osage]
474 /// );
475 /// assert_eq!(
476 /// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
477 /// .iter()
478 /// .collect::<Vec<_>>(),
479 /// [Script::Common]
480 /// );
481 /// assert_eq!(
482 /// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
483 /// .iter()
484 /// .collect::<Vec<_>>(),
485 /// [Script::Inherited]
486 /// );
487 /// assert_eq!(
488 /// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
489 /// .iter()
490 /// .collect::<Vec<_>>(),
491 /// [Script::Tamil, Script::Grantha]
492 /// );
493 /// ```
494 pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> {
495 self.get_script_extensions_val32(ch as u32)
496 }
497
498 /// See [`Self::get_script_extensions_val`].
499 pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> {
500 let sc_with_ext_ule = self.data.trie.get32_ule(code_point);
501
502 ScriptExtensionsSet {
503 values: match sc_with_ext_ule {
504 Some(ule_ref) => self.get_scx_val_using_trie_val(ule_ref),
505 None => ZeroSlice::from_ule_slice(&[]),
506 },
507 }
508 }
509
510 /// Returns whether `script` is contained in the `Script_Extensions`
511 /// property value if the `code_point` has `Script_Extensions`, otherwise
512 /// if the code point does not have `Script_Extensions` then returns
513 /// whether the Script property value matches.
514 ///
515 /// Some characters are commonly used in multiple scripts. For more information,
516 /// see UAX #24: <https://www.unicode.org/reports/tr24/>.
517 ///
518 /// # Examples
519 ///
520 /// ```
521 /// use icu::properties::script::ScriptWithExtensions;
522 /// use icu::properties::props::Script;
523 ///
524 /// let swe = ScriptWithExtensions::new();
525 ///
526 /// // U+0650 ARABIC KASRA
527 /// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
528 /// assert!(swe.has_script('\u{0650}', Script::Arabic));
529 /// assert!(swe.has_script('\u{0650}', Script::Syriac));
530 /// assert!(!swe.has_script('\u{0650}', Script::Thaana));
531 ///
532 /// // U+0660 ARABIC-INDIC DIGIT ZERO
533 /// assert!(!swe.has_script('٠', Script::Common)); // main Script value
534 /// assert!(swe.has_script('٠', Script::Arabic));
535 /// assert!(!swe.has_script('٠', Script::Syriac));
536 /// assert!(swe.has_script('٠', Script::Thaana));
537 ///
538 /// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
539 /// assert!(!swe.has_script('ﷲ', Script::Common));
540 /// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value
541 /// assert!(!swe.has_script('ﷲ', Script::Syriac));
542 /// assert!(swe.has_script('ﷲ', Script::Thaana));
543 /// ```
544 pub fn has_script(self, ch: char, script: Script) -> bool {
545 self.has_script32(ch as u32, script)
546 }
547
548 /// See [`Self::has_script`].
549 pub fn has_script32(self, code_point: u32, script: Script) -> bool {
550 let sc_with_ext_ule = if let Some(scwe_ule) = self.data.trie.get32_ule(code_point) {
551 scwe_ule
552 } else {
553 return false;
554 };
555 let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
556
557 if !sc_with_ext.has_extensions() {
558 let script_val = sc_with_ext.0;
559 script == Script(script_val)
560 } else {
561 let scx_val = self.get_scx_val_using_trie_val(sc_with_ext_ule);
562 let script_find = scx_val.iter().find(|&sc| sc == script);
563 script_find.is_some()
564 }
565 }
566
567 /// Returns all of the matching `CodePointMapRange`s for the given [`Script`]
568 /// in which `has_script` will return true for all of the contained code points.
569 ///
570 /// # Examples
571 ///
572 /// ```
573 /// use icu::properties::props::Script;
574 /// use icu::properties::script::ScriptWithExtensions;
575 ///
576 /// let swe = ScriptWithExtensions::new();
577 ///
578 /// let syriac_script_extensions_ranges =
579 /// swe.get_script_extensions_ranges(Script::Syriac);
580 ///
581 /// let exp_ranges = [
582 /// 0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON
583 /// 0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS
584 /// 0x030A..=0x030A, // COMBINING RING ABOVE
585 /// 0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW
586 /// 0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW
587 /// 0x0330..=0x0331, // COMBINING TILDE BELOW..COMBINING MACRON BELOW
588 /// 0x060C..=0x060C, // ARABIC COMMA
589 /// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
590 /// 0x061F..=0x061F, // ARABIC QUESTION MARK
591 /// 0x0640..=0x0640, // ARABIC TATWEEL
592 /// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
593 /// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
594 /// 0x0700..=0x070D, // Syriac block begins at U+0700
595 /// 0x070F..=0x074A, // Syriac block
596 /// 0x074D..=0x074F, // Syriac block ends at U+074F
597 /// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
598 /// 0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT
599 /// 0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT
600 /// ];
601 ///
602 /// assert_eq!(
603 /// syriac_script_extensions_ranges.collect::<Vec<_>>(),
604 /// exp_ranges
605 /// );
606 /// ```
607 pub fn get_script_extensions_ranges(
608 self,
609 script: Script,
610 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
611 self.data
612 .trie
613 .iter_ranges_mapped(move |value| {
614 let sc_with_ext = ScriptWithExt(value.0);
615 if sc_with_ext.has_extensions() {
616 self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
617 .iter()
618 .any(|sc| sc == script)
619 } else {
620 script == sc_with_ext.into()
621 }
622 })
623 .filter(|v| v.value)
624 .map(|v| v.range)
625 }
626
627 /// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
628 /// code points for which `has_script` will return true.
629 ///
630 /// ✨ *Enabled with the `alloc` Cargo feature.*
631 ///
632 /// # Examples
633 ///
634 /// ```
635 /// use icu::properties::script::ScriptWithExtensions;
636 /// use icu::properties::props::Script;
637 ///
638 /// let swe = ScriptWithExtensions::new();
639 ///
640 /// let syriac = swe.get_script_extensions_set(Script::Syriac);
641 ///
642 /// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK
643 /// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK
644 /// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH
645 ///
646 /// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
647 /// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
648 /// assert!(!syriac.contains('\u{074B}')); // unassigned
649 /// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE
650 /// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
651 ///
652 /// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT
653 /// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW
654 /// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT
655 /// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK
656 /// ```
657 #[cfg(feature = "alloc")]
658 pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
659 CodePointInversionList::from_iter(self.get_script_extensions_ranges(script))
660 }
661}
662
#[cfg(feature = "compiled_data")]
impl Default for ScriptWithExtensionsBorrowed<'static> {
    /// Same as [`ScriptWithExtensionsBorrowed::new`]: backed by compiled data.
    fn default() -> Self {
        ScriptWithExtensionsBorrowed::new()
    }
}
669
670impl ScriptWithExtensionsBorrowed<'static> {
671 /// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
672 ///
673 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
674 ///
675 /// [📚 Help choosing a constructor](icu_provider::constructors)
676 #[cfg(feature = "compiled_data")]
677 pub fn new() -> Self {
678 Self {
679 data: Baked::SINGLETON_PROPERTY_SCRIPT_WITH_EXTENSIONS_V1,
680 }
681 }
682
683 /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`].
684 ///
685 /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some
686 /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`].
687 pub const fn static_to_owned(self) -> ScriptWithExtensions {
688 ScriptWithExtensions {
689 data: DataPayload::from_static_ref(self.data),
690 }
691 }
692}
693
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for <https://github.com/unicode-org/icu4x/issues/6041>
    #[test]
    fn test_scx_regression_6041() {
        let swe = ScriptWithExtensions::new();
        let actual: Vec<Script> = swe.get_script_extensions_val('\u{2bc}').iter().collect();

        let expected = [
            Script::Bengali,
            Script::Cyrillic,
            Script::Devanagari,
            Script::Latin,
            Script::Thai,
            Script::Lisu,
            Script::Toto,
        ];
        assert_eq!(actual, expected);
    }
}