icu_collections/codepointinvlist/
cpinvlist.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "serde")]
6use alloc::format;
7#[cfg(feature = "serde")]
8use alloc::string::String;
9use alloc::vec::Vec;
10use core::{char, ops::RangeBounds, ops::RangeInclusive};
11use yoke::Yokeable;
12use zerofrom::ZeroFrom;
13use zerovec::{ule::AsULE, zerovec, ZeroVec};
14
15use super::CodePointInversionListError;
16use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv};
17
18/// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive
19const BMP_MAX: u32 = 0xFFFF;
20
21/// Represents the inversion list for a set of all code points in the Basic Multilingual Plane.
22const BMP_INV_LIST_VEC: ZeroVec<u32> =
23    zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, BMP_MAX + 1]);
24
25/// Represents the inversion list for all of the code points in the Unicode range.
26const ALL_VEC: ZeroVec<u32> =
27    zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, (char::MAX as u32) + 1]);
28
29/// A membership wrapper for [`CodePointInversionList`].
30///
31/// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points)
32/// and predefined ranges.
33#[zerovec::make_varule(CodePointInversionListULE)]
34#[zerovec::skip_derive(Ord)]
35#[zerovec::derive(Debug)]
36#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
37pub struct CodePointInversionList<'data> {
38    // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature
39    // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html
40    // Allows for traits of fixed size arrays
41
42    // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list)
43    inv_list: ZeroVec<'data, u32>,
44    size: u32,
45}
46
47#[cfg(feature = "serde")]
48impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> {
49    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
50    where
51        D: serde::Deserializer<'de>,
52    {
53        use serde::de::Error;
54
55        let parsed_inv_list = if deserializer.is_human_readable() {
56            #[derive(serde::Deserialize)]
57            #[serde(untagged)]
58            pub enum De<'data> {
59                // TODO(#2856): Remove in ICU4X 2.0
60                #[serde(borrow)]
61                OldStyle(ZeroVec<'data, u32>),
62                #[serde(borrow)]
63                NewStyle(Vec<alloc::borrow::Cow<'data, str>>),
64            }
65
66            match De::<'de>::deserialize(deserializer)? {
67                De::OldStyle(parsed_inv_list) => parsed_inv_list,
68                De::NewStyle(parsed_strings) => {
69                    let mut inv_list =
70                        ZeroVec::new_owned(Vec::with_capacity(parsed_strings.len() * 2));
71                    for range in parsed_strings {
72                        fn internal(range: &str) -> Option<(u32, u32)> {
73                            let (start, range) = UnicodeCodePoint::parse(range)?;
74                            if range.is_empty() {
75                                return Some((start.0, start.0));
76                            }
77                            let (hyphen, range) = UnicodeCodePoint::parse(range)?;
78                            if hyphen.0 != '-' as u32 {
79                                return None;
80                            }
81                            let (end, range) = UnicodeCodePoint::parse(range)?;
82                            range.is_empty().then_some((start.0, end.0))
83                        }
84                        let (start, end) = internal(&range).ok_or_else(|| Error::custom(format!(
85                            "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}"
86                        )))?;
87                        inv_list.with_mut(|v| {
88                            v.push(start.to_unaligned());
89                            v.push((end + 1).to_unaligned());
90                        });
91                    }
92                    inv_list
93                }
94            }
95        } else {
96            ZeroVec::<u32>::deserialize(deserializer)?
97        };
98        CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| {
99            Error::custom(format!(
100                "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}"
101            ))
102        })
103    }
104}
105
106#[cfg(feature = "databake")]
107impl databake::Bake for CodePointInversionList<'_> {
108    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
109        env.insert("icu_collections");
110        let inv_list = self.inv_list.bake(env);
111        let size = self.size.bake(env);
112        // Safe because our parts are safe.
113        databake::quote! { unsafe {
114            #[allow(unused_unsafe)]
115            icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size)
116        }}
117    }
118}
119
120#[cfg(feature = "serde")]
121#[derive(Debug, Copy, Clone)]
122struct UnicodeCodePoint(u32);
123
124#[cfg(feature = "serde")]
125impl UnicodeCodePoint {
126    fn from_u32(cp: u32) -> Result<Self, String> {
127        if cp <= char::MAX as u32 {
128            Ok(Self(cp))
129        } else {
130            Err(format!("Not a Unicode code point {}", cp))
131        }
132    }
133
134    fn parse(value: &str) -> Option<(Self, &str)> {
135        Some(if let Some(hex) = value.strip_prefix("U+") {
136            let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?);
137            (Self(u32::from_str_radix(escape, 16).ok()?), remainder)
138        } else {
139            let c = value.chars().next()?;
140            (Self(c as u32), value.get(c.len_utf8()..)?)
141        })
142    }
143}
144
145#[cfg(feature = "serde")]
146impl core::fmt::Display for UnicodeCodePoint {
147    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
148        match self.0 {
149            s @ 0xD800..=0xDFFF => write!(f, "U+{s:X}"),
150            // SAFETY: c <= char::MAX by construction, and not a surrogate
151            c => write!(f, "{}", unsafe { char::from_u32_unchecked(c) }),
152        }
153    }
154}
155
156#[cfg(feature = "serde")]
157impl<'data> serde::Serialize for CodePointInversionList<'data> {
158    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
159    where
160        S: serde::Serializer,
161    {
162        if serializer.is_human_readable() {
163            use serde::ser::Error;
164            use serde::ser::SerializeSeq;
165            let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?;
166            for range in self.iter_ranges() {
167                let start = UnicodeCodePoint::from_u32(*range.start()).map_err(S::Error::custom)?;
168                if range.start() == range.end() {
169                    seq.serialize_element(&format!("{start}"))?;
170                } else {
171                    let end = UnicodeCodePoint::from_u32(*range.end()).map_err(S::Error::custom)?;
172                    seq.serialize_element(&format!("{start}-{end}",))?;
173                }
174            }
175            seq.end()
176        } else {
177            // Note: serde(flatten) currently does not promote a struct field of type Vec
178            // to replace the struct when serializing. The error message from the default
179            // serialization is: "can only flatten structs and maps (got a sequence)".
180            self.inv_list.serialize(serializer)
181        }
182    }
183}
184
185impl<'data> CodePointInversionList<'data> {
186    /// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
187    /// represented as a [`ZeroVec`]`<`[`u32`]`>` of code points.
188    ///
189    /// The inversion list must be of even length, sorted ascending non-overlapping,
190    /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
191    ///
192    /// # Examples
193    ///
194    /// ```
195    /// use icu::collections::codepointinvlist::CodePointInversionList;
196    /// use icu::collections::codepointinvlist::CodePointInversionListError;
197    /// use zerovec::ZeroVec;
198    /// let valid = [0x0, 0x10000];
199    /// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&valid);
200    /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
201    /// assert!(matches!(result, CodePointInversionList));
202    ///
203    /// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
204    /// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&invalid);
205    /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
206    /// assert!(matches!(
207    ///     result,
208    ///     Err(CodePointInversionListError::InvalidSet(_))
209    /// ));
210    /// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
211    ///     assert_eq!(&invalid, &actual);
212    /// }
213    /// ```
214    pub fn try_from_inversion_list(
215        inv_list: ZeroVec<'data, u32>,
216    ) -> Result<Self, CodePointInversionListError> {
217        #[allow(clippy::indexing_slicing)] // chunks
218        if is_valid_zv(&inv_list) {
219            let size = inv_list
220                .as_ule_slice()
221                .chunks(2)
222                .map(|end_points| {
223                    <u32 as AsULE>::from_unaligned(end_points[1])
224                        - <u32 as AsULE>::from_unaligned(end_points[0])
225                })
226                .sum::<u32>();
227            Ok(Self { inv_list, size })
228        } else {
229            Err(CodePointInversionListError::InvalidSet(inv_list.to_vec()))
230        }
231    }
232
233    #[doc(hidden)] // databake internal
234    pub const unsafe fn from_parts_unchecked(inv_list: ZeroVec<'data, u32>, size: u32) -> Self {
235        Self { inv_list, size }
236    }
237
238    /// Returns a new [`CodePointInversionList`] by borrowing an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
239    /// represented as a slice of [`u32`] code points.
240    ///
241    /// The inversion list must be of even length, sorted ascending non-overlapping,
242    /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
243    ///
244    /// Note: The slice may be cloned on certain platforms; for more information, see [`ZeroVec::from_slice_or_alloc`].
245    ///
246    /// # Examples
247    ///
248    /// ```
249    /// use icu::collections::codepointinvlist::CodePointInversionList;
250    /// use icu::collections::codepointinvlist::CodePointInversionListError;
251    /// let valid = [0x0, 0x10000];
252    /// let result = CodePointInversionList::try_from_inversion_list_slice(&valid);
253    /// assert!(matches!(result, CodePointInversionList));
254    ///
255    /// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
256    /// let result =
257    ///     CodePointInversionList::try_from_inversion_list_slice(&invalid);
258    /// assert!(matches!(
259    ///     result,
260    ///     Err(CodePointInversionListError::InvalidSet(_))
261    /// ));
262    /// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
263    ///     assert_eq!(&invalid, &actual);
264    /// }
265    /// ```
266    pub fn try_from_inversion_list_slice(
267        inv_list: &'data [u32],
268    ) -> Result<Self, CodePointInversionListError> {
269        let inv_list_zv: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(inv_list);
270        CodePointInversionList::try_from_inversion_list(inv_list_zv)
271    }
272
273    /// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
274    /// represented as a slice of [`u32`] code points.
275    ///
276    /// The inversion list must be of even length, sorted ascending non-overlapping,
277    /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
278    ///
279    /// # Examples
280    ///
281    /// ```
282    /// use icu::collections::codepointinvlist::CodePointInversionList;
283    ///
284    /// let bmp_list = &[0x0, 0x10000];
285    /// let smp_list = &[0x10000, 0x20000];
286    /// let sip_list = &[0x20000, 0x30000];
287    ///
288    /// let lists: Vec<CodePointInversionList> =
289    ///     [&bmp_list[..], smp_list, sip_list]
290    ///         .into_iter()
291    ///         .map(|l| {
292    ///             CodePointInversionList::try_clone_from_inversion_list_slice(l)
293    ///                 .unwrap()
294    ///         })
295    ///         .collect();
296    ///
297    /// let bmp = &lists[0];
298    /// assert!(bmp.contains32(0xFFFF));
299    /// assert!(!bmp.contains32(0x10000));
300    ///
301    /// assert!(!lists.iter().any(|set| set.contains32(0x40000)));
302    /// ```
303    pub fn try_clone_from_inversion_list_slice(
304        inv_list: &[u32],
305    ) -> Result<Self, CodePointInversionListError> {
306        let inv_list_zv: ZeroVec<u32> = ZeroVec::alloc_from_slice(inv_list);
307        CodePointInversionList::try_from_inversion_list(inv_list_zv)
308    }
309
310    /// Attempts to convert this list into a fully-owned one. No-op if already fully owned
311    pub fn into_owned(self) -> CodePointInversionList<'static> {
312        CodePointInversionList {
313            inv_list: self.inv_list.into_owned(),
314            size: self.size,
315        }
316    }
317
318    /// Returns an owned inversion list representing the current [`CodePointInversionList`]
319    pub fn get_inversion_list_vec(&self) -> Vec<u32> {
320        let result: Vec<u32> = self.as_inversion_list().to_vec(); // Only crate public, to not leak impl
321        result
322    }
323
324    /// Returns [`CodePointInversionList`] spanning entire Unicode range
325    ///
326    /// The range spans from `0x0 -> 0x10FFFF` inclusive.
327    ///  
328    /// # Examples
329    ///
330    /// ```
331    /// use icu::collections::codepointinvlist::CodePointInversionList;
332    ///
333    /// let expected = [0x0, (char::MAX as u32) + 1];
334    /// assert_eq!(
335    ///     CodePointInversionList::all().get_inversion_list_vec(),
336    ///     expected
337    /// );
338    /// assert_eq!(
339    ///     CodePointInversionList::all().size(),
340    ///     (expected[1] - expected[0]) as usize
341    /// );
342    /// ```
343    pub fn all() -> Self {
344        Self {
345            inv_list: ALL_VEC,
346            size: (char::MAX as u32) + 1,
347        }
348    }
349
350    /// Returns [`CodePointInversionList`] spanning BMP range
351    ///
352    /// The range spans from `0x0 -> 0xFFFF` inclusive.
353    ///
354    /// # Examples
355    ///
356    /// ```
357    /// use icu::collections::codepointinvlist::CodePointInversionList;
358    ///
359    /// const BMP_MAX: u32 = 0xFFFF;
360    ///
361    /// let expected = [0x0, BMP_MAX + 1];
362    /// assert_eq!(
363    ///     CodePointInversionList::bmp().get_inversion_list_vec(),
364    ///     expected
365    /// );
366    /// assert_eq!(
367    ///     CodePointInversionList::bmp().size(),
368    ///     (expected[1] - expected[0]) as usize
369    /// );
370    /// ```
371    pub fn bmp() -> Self {
372        Self {
373            inv_list: BMP_INV_LIST_VEC,
374            size: BMP_MAX + 1,
375        }
376    }
377
378    /// Returns the inversion list as a slice
379    ///
380    /// Public only to the crate, not exposed to public
381    pub(crate) fn as_inversion_list(&self) -> &ZeroVec<u32> {
382        &self.inv_list
383    }
384
385    /// Yields an [`Iterator`] going through the character set in the [`CodePointInversionList`]
386    ///
387    /// # Examples
388    ///
389    /// ```
390    /// use icu::collections::codepointinvlist::CodePointInversionList;
391    /// let example_list = [0x41, 0x44, 0x45, 0x46];
392    /// let example =
393    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
394    ///         .unwrap();
395    /// let mut ex_iter_chars = example.iter_chars();
396    /// assert_eq!(Some('A'), ex_iter_chars.next());
397    /// assert_eq!(Some('B'), ex_iter_chars.next());
398    /// assert_eq!(Some('C'), ex_iter_chars.next());
399    /// assert_eq!(Some('E'), ex_iter_chars.next());
400    /// assert_eq!(None, ex_iter_chars.next());
401    /// ```
402    pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ {
403        #[allow(clippy::indexing_slicing)] // chunks
404        self.inv_list
405            .as_ule_slice()
406            .chunks(2)
407            .flat_map(|pair| (AsULE::from_unaligned(pair[0])..AsULE::from_unaligned(pair[1])))
408            .filter_map(char::from_u32)
409    }
410
411    /// Yields an [`Iterator`] returning the ranges of the code points that are
412    /// included in the [`CodePointInversionList`]
413    ///
414    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
415    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
416    /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
417    ///
418    /// # Example
419    ///
420    /// ```
421    /// use icu::collections::codepointinvlist::CodePointInversionList;
422    /// let example_list = [0x41, 0x44, 0x45, 0x46];
423    /// let example =
424    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
425    ///         .unwrap();
426    /// let mut example_iter_ranges = example.iter_ranges();
427    /// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next());
428    /// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next());
429    /// assert_eq!(None, example_iter_ranges.next());
430    /// ```
431    pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ {
432        #[allow(clippy::indexing_slicing)] // chunks
433        self.inv_list.as_ule_slice().chunks(2).map(|pair| {
434            let range_start: u32 = AsULE::from_unaligned(pair[0]);
435            let range_limit: u32 = AsULE::from_unaligned(pair[1]);
436            RangeInclusive::new(range_start, range_limit - 1)
437        })
438    }
439
440    /// Yields an [`Iterator`] returning the ranges of the code points that are
441    /// *not* included in the [`CodePointInversionList`]
442    ///
443    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
444    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
445    /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
446    ///
447    /// # Example
448    ///
449    /// ```
450    /// use icu::collections::codepointinvlist::CodePointInversionList;
451    /// let example_list = [0x41, 0x44, 0x45, 0x46];
452    /// let example =
453    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
454    ///         .unwrap();
455    /// let mut example_iter_ranges = example.iter_ranges_complemented();
456    /// assert_eq!(Some(0..=0x40), example_iter_ranges.next());
457    /// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next());
458    /// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next());
459    /// assert_eq!(None, example_iter_ranges.next());
460    /// ```
461    pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
462        let inv_ule = self.inv_list.as_ule_slice();
463        let middle = inv_ule.get(1..inv_ule.len() - 1).unwrap_or(&[]);
464        let beginning = if let Some(first) = self.inv_list.first() {
465            if first == 0 {
466                None
467            } else {
468                Some(0..=first - 1)
469            }
470        } else {
471            None
472        };
473        let end = if let Some(last) = self.inv_list.last() {
474            if last == char::MAX as u32 {
475                None
476            } else {
477                Some(last..=char::MAX as u32)
478            }
479        } else {
480            None
481        };
482        #[allow(clippy::indexing_slicing)] // chunks
483        let chunks = middle.chunks(2).map(|pair| {
484            let range_start: u32 = AsULE::from_unaligned(pair[0]);
485            let range_limit: u32 = AsULE::from_unaligned(pair[1]);
486            RangeInclusive::new(range_start, range_limit - 1)
487        });
488        beginning.into_iter().chain(chunks).chain(end)
489    }
490
491    /// Returns the number of ranges contained in this [`CodePointInversionList`]
492    pub fn get_range_count(&self) -> usize {
493        self.inv_list.len() / 2
494    }
495
496    /// Returns a specific range contained in this [`CodePointInversionList`] by index.
497    /// Intended for use in FFI.
498    pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> {
499        let start_idx = idx * 2;
500        let end_idx = start_idx + 1;
501        let start = self.inv_list.get(start_idx)?;
502        let end = self.inv_list.get(end_idx)?;
503        Some(RangeInclusive::new(start, end - 1))
504    }
505
506    /// Returns the number of elements of the [`CodePointInversionList`]
507    pub fn size(&self) -> usize {
508        if self.is_empty() {
509            return 0;
510        }
511        self.size as usize
512    }
513
514    /// Returns whether or not the [`CodePointInversionList`] is empty
515    pub fn is_empty(&self) -> bool {
516        self.inv_list.is_empty()
517    }
518
519    /// Wrapper for contains
520    ///
521    /// Returns an [`Option`] as to whether or not it is possible for the query to be contained.
522    /// The value in the [`Option`] is the start index of the range that contains the query.
523    fn contains_query(&self, query: u32) -> Option<usize> {
524        match self.inv_list.binary_search(&query) {
525            Ok(pos) => {
526                if pos % 2 == 0 {
527                    Some(pos)
528                } else {
529                    None
530                }
531            }
532            Err(pos) => {
533                if pos % 2 != 0 && pos < self.inv_list.len() {
534                    Some(pos - 1)
535                } else {
536                    None
537                }
538            }
539        }
540    }
541
542    /// Checks to see the query is in the [`CodePointInversionList`]
543    ///
544    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
545    /// in the set using [`core`] implementation
546    ///
547    /// # Examples
548    ///
549    /// ```
550    /// use icu::collections::codepointinvlist::CodePointInversionList;
551    /// let example_list = [0x41, 0x43, 0x44, 0x45];
552    /// let example =
553    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
554    ///         .unwrap();
555    /// assert!(example.contains('A'));
556    /// assert!(!example.contains('C'));
557    /// ```
558    pub fn contains(&self, query: char) -> bool {
559        self.contains_query(query as u32).is_some()
560    }
561
562    /// Checks to see the unsigned int is in the [`CodePointInversionList::all()`](CodePointInversionList::all())
563    ///
564    /// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
565    /// values, there is an important difference. A [`u32`] can take values up to
566    /// a very large integer value, while a [`prim@char`] in Rust is defined to be in
567    /// the range from 0 to the maximum valid Unicode Scalar Value.
568    ///
569    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
570    /// in the set using [`core`] implementation
571    ///
572    /// # Examples
573    ///
574    /// ```
575    /// use icu::collections::codepointinvlist::CodePointInversionList;
576    /// let example_list = [0x41, 0x43, 0x44, 0x45];
577    /// let example =
578    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
579    ///         .unwrap();
580    /// assert!(example.contains32(0x41));
581    /// assert!(!example.contains32(0x43));
582    /// ```
583    pub fn contains32(&self, query: u32) -> bool {
584        self.contains_query(query).is_some()
585    }
586
587    /// Checks to see if the range is in the [`CodePointInversionList`]
588    ///
589    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
590    /// in the set using [`Vec`] implementation. Only runs the search once on the `start`
591    /// parameter, while the `end` parameter is checked in a single `O(1)` step.
592    ///
593    /// # Examples
594    ///
595    /// ```
596    /// use icu::collections::codepointinvlist::CodePointInversionList;
597    /// let example_list = [0x41, 0x43, 0x44, 0x45];
598    /// let example =
599    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
600    ///         .unwrap();
601    /// assert!(example.contains_range(&('A'..'C')));
602    /// assert!(example.contains_range(&('A'..='B')));
603    /// assert!(!example.contains_range(&('A'..='C')));
604    /// ```
605    ///
606    /// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the
607    /// [`CodePointInversionList`] does not.
608    ///
609    /// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are
610    /// constructed inclusive of start boundary and exclusive of end boundary.
611    /// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method
612    /// differs by including the end boundary.
613    ///
614    /// # Examples
615    ///
616    /// ```
617    /// use icu::collections::codepointinvlist::CodePointInversionList;
618    /// use std::char;
619    /// let check =
620    ///     char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap();
621    /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001];
622    /// let example =
623    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
624    ///         .unwrap();
625    /// assert!(!example.contains_range(&(check)));
626    /// ```
627    pub fn contains_range(&self, range: &impl RangeBounds<char>) -> bool {
628        let (from, till) = deconstruct_range(range);
629        if from >= till {
630            return false;
631        }
632        match self.contains_query(from) {
633            Some(pos) => {
634                if let Some(x) = self.inv_list.get(pos + 1) {
635                    (till) <= x
636                } else {
637                    debug_assert!(
638                        false,
639                        "Inversion list query should not return out of bounds index"
640                    );
641                    false
642                }
643            }
644            None => false,
645        }
646    }
647
648    /// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`]
649    ///
650    /// # Examples
651    ///
652    /// ```
653    /// use icu::collections::codepointinvlist::CodePointInversionList;
654    /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z
655    /// let example =
656    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
657    ///         .unwrap();
658    /// let a_to_d =
659    ///     CodePointInversionList::try_from_inversion_list_slice(&[0x41, 0x45])
660    ///         .unwrap();
661    /// let f_to_t =
662    ///     CodePointInversionList::try_from_inversion_list_slice(&[0x46, 0x55])
663    ///         .unwrap();
664    /// let r_to_x =
665    ///     CodePointInversionList::try_from_inversion_list_slice(&[0x52, 0x58])
666    ///         .unwrap();
667    /// assert!(example.contains_set(&a_to_d)); // contains all
668    /// assert!(!example.contains_set(&f_to_t)); // contains none
669    /// assert!(!example.contains_set(&r_to_x)); // contains some
670    /// ```
671    pub fn contains_set(&self, set: &Self) -> bool {
672        if set.size() > self.size() {
673            return false;
674        }
675
676        let mut set_ranges = set.iter_ranges();
677        let mut check_elem = set_ranges.next();
678
679        let ranges = self.iter_ranges();
680        for range in ranges {
681            match check_elem {
682                Some(ref check_range) => {
683                    if check_range.start() >= range.start()
684                        && check_range.end() <= &(range.end() + 1)
685                    {
686                        check_elem = set_ranges.next();
687                    }
688                }
689                _ => break,
690            }
691        }
692        check_elem.is_none()
693    }
694
695    /// Returns the end of the initial substring where the characters are either contained/not contained
696    /// in the set.
697    ///
698    /// # Examples
699    ///
700    /// ```
701    /// use icu::collections::codepointinvlist::CodePointInversionList;
702    /// let example_list = [0x41, 0x44]; // {A, B, C}
703    /// let example =
704    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
705    ///         .unwrap();
706    /// assert_eq!(example.span("CABXYZ", true), 3);
707    /// assert_eq!(example.span("XYZC", false), 3);
708    /// assert_eq!(example.span("XYZ", true), 0);
709    /// assert_eq!(example.span("ABC", false), 0);
710    /// ```
711    pub fn span(&self, span_str: &str, contained: bool) -> usize {
712        span_str
713            .chars()
714            .take_while(|&x| self.contains(x) == contained)
715            .count()
716    }
717
718    /// Returns the start of the trailing substring (starting from end of string) where the characters are
719    /// either contained/not contained in the set. Returns the length of the string if no valid return.
720    ///
721    /// # Examples
722    ///
723    /// ```
724    /// use icu::collections::codepointinvlist::CodePointInversionList;
725    /// let example_list = [0x41, 0x44]; // {A, B, C}
726    /// let example =
727    ///     CodePointInversionList::try_from_inversion_list_slice(&example_list)
728    ///         .unwrap();
729    /// assert_eq!(example.span_back("XYZCAB", true), 3);
730    /// assert_eq!(example.span_back("ABCXYZ", true), 6);
731    /// assert_eq!(example.span_back("CABXYZ", false), 3);
732    /// ```
733    pub fn span_back(&self, span_str: &str, contained: bool) -> usize {
734        span_str.len()
735            - span_str
736                .chars()
737                .rev()
738                .take_while(|&x| self.contains(x) == contained)
739                .count()
740    }
741}
742
743#[cfg(test)]
744mod tests {
745    use super::{CodePointInversionList, CodePointInversionListError};
746    use std::{char, vec::Vec};
747    use zerovec::ZeroVec;
748
749    #[test]
750    fn test_codepointinversionlist_try_from_vec() {
751        let ex = vec![0x2, 0x3, 0x4, 0x5];
752        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
753        assert_eq!(ex, check.get_inversion_list_vec());
754        assert_eq!(2, check.size());
755    }
756
757    #[test]
758    fn test_codepointinversionlist_try_from_vec_error() {
759        let check = vec![0x1, 0x1, 0x2, 0x3, 0x4];
760        let inv_list = ZeroVec::from_slice_or_alloc(&check);
761        let set = CodePointInversionList::try_from_inversion_list(inv_list);
762        assert!(matches!(
763            set,
764            Err(CodePointInversionListError::InvalidSet(_))
765        ));
766        if let Err(CodePointInversionListError::InvalidSet(actual)) = set {
767            assert_eq!(&check, &actual);
768        }
769    }
770
771    // CodePointInversionList membership functions
772    #[test]
773    fn test_codepointinversionlist_contains_query() {
774        let ex = vec![0x41, 0x46, 0x4B, 0x55];
775        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
776        assert!(check.contains_query(0x40).is_none());
777        assert_eq!(check.contains_query(0x41).unwrap(), 0);
778        assert_eq!(check.contains_query(0x44).unwrap(), 0);
779        assert!(check.contains_query(0x46).is_none());
780        assert_eq!(check.contains_query(0x4C).unwrap(), 2);
781        assert!(check.contains_query(0x56).is_none());
782    }
783
784    #[test]
785    fn test_codepointinversionlist_contains() {
786        let ex = vec![0x2, 0x5, 0xA, 0xF];
787        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
788        assert!(check.contains(0x2 as char));
789        assert!(check.contains(0x4 as char));
790        assert!(check.contains(0xA as char));
791        assert!(check.contains(0xE as char));
792    }
793
794    #[test]
795    fn test_codepointinversionlist_contains_false() {
796        let ex = vec![0x2, 0x5, 0xA, 0xF];
797        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
798        assert!(!check.contains(0x1 as char));
799        assert!(!check.contains(0x5 as char));
800        assert!(!check.contains(0x9 as char));
801        assert!(!check.contains(0xF as char));
802        assert!(!check.contains(0x10 as char));
803    }
804
805    #[test]
806    fn test_codepointinversionlist_contains_range() {
807        let ex = vec![0x41, 0x46, 0x4B, 0x55];
808        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
809        assert!(check.contains_range(&('A'..='E'))); // 65 - 69
810        assert!(check.contains_range(&('C'..'D'))); // 67 - 67
811        assert!(check.contains_range(&('L'..'P'))); // 76 - 80
812        assert!(!check.contains_range(&('L'..='U'))); // 76 - 85
813    }
814
815    #[test]
816    fn test_codepointinversionlist_contains_range_false() {
817        let ex = vec![0x41, 0x46, 0x4B, 0x55];
818        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
819        assert!(!check.contains_range(&('!'..'A'))); // 33 - 65
820        assert!(!check.contains_range(&('F'..'K'))); // 70 - 74
821        assert!(!check.contains_range(&('U'..))); // 85 - ..
822    }
823
824    #[test]
825    fn test_codepointinversionlist_contains_range_invalid() {
826        let check = CodePointInversionList::all();
827        assert!(!check.contains_range(&('A'..'!'))); // 65 - 33
828        assert!(!check.contains_range(&('A'..'A'))); // 65 - 65
829    }
830
831    #[test]
832    fn test_codepointinversionlist_contains_set_u() {
833        let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E];
834        let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
835        let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D];
836        let s = CodePointInversionList::try_from_inversion_list_slice(&inside).unwrap();
837        assert!(u.contains_set(&s));
838    }
839
840    #[test]
841    fn test_codepointinversionlist_contains_set_u_false() {
842        let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78];
843        let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
844        let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F];
845        let s = CodePointInversionList::try_from_inversion_list_slice(&outside).unwrap();
846        assert!(!u.contains_set(&s));
847    }
848
849    #[test]
850    fn test_codepointinversionlist_size() {
851        let ex = vec![0x2, 0x5, 0xA, 0xF];
852        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
853        assert_eq!(8, check.size());
854        let check = CodePointInversionList::all();
855        let expected = (char::MAX as u32) + 1;
856        assert_eq!(expected as usize, check.size());
857        let inv_list_vec: Vec<u32> = vec![];
858        let check = CodePointInversionList {
859            inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
860            size: 0,
861        };
862        assert_eq!(check.size(), 0);
863    }
864
865    #[test]
866    fn test_codepointinversionlist_is_empty() {
867        let inv_list_vec: Vec<u32> = vec![];
868        let check = CodePointInversionList {
869            inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
870            size: 0,
871        };
872        assert!(check.is_empty());
873    }
874
875    #[test]
876    fn test_codepointinversionlist_is_not_empty() {
877        let check = CodePointInversionList::all();
878        assert!(!check.is_empty());
879    }
880
881    #[test]
882    fn test_codepointinversionlist_iter_chars() {
883        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
884        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
885        let mut iter = check.iter_chars();
886        assert_eq!(Some('A'), iter.next());
887        assert_eq!(Some('B'), iter.next());
888        assert_eq!(Some('C'), iter.next());
889        assert_eq!(Some('E'), iter.next());
890        assert_eq!(None, iter.next());
891    }
892
893    #[test]
894    fn test_codepointinversionlist_iter_ranges() {
895        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
896        let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
897        let mut ranges = set.iter_ranges();
898        assert_eq!(Some(0x41..=0x43), ranges.next());
899        assert_eq!(Some(0x45..=0x45), ranges.next());
900        assert_eq!(Some(0xD800..=0xD800), ranges.next());
901        assert_eq!(None, ranges.next());
902    }
903
904    #[test]
905    fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() {
906        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
907        let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
908        let ranges = set.iter_ranges();
909        assert_eq!(3, ranges.len());
910    }
911
912    #[test]
913    fn test_codepointinversionlist_range_count() {
914        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
915        let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
916        assert_eq!(3, set.get_range_count());
917    }
918
919    #[test]
920    fn test_codepointinversionlist_get_nth_range() {
921        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
922        let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
923        assert_eq!(Some(0x41..=0x43), set.get_nth_range(0));
924        assert_eq!(Some(0x45..=0x45), set.get_nth_range(1));
925        assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2));
926        assert_eq!(None, set.get_nth_range(3));
927    }
928
929    // Range<char> cannot represent the upper bound (non-inclusive) for
930    // char::MAX, whereas Range<u32> can.
931    #[test]
932    fn test_codepointinversionlist_iter_ranges_with_max_code_point() {
933        let ex = vec![0x80, (char::MAX as u32) + 1];
934        let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
935        let mut ranges = set.iter_ranges();
936        assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next());
937        assert_eq!(None, ranges.next());
938    }
939
940    #[test]
941    fn test_codepointinversionlist_span_contains() {
942        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
943        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
944        assert_eq!(check.span("ABCDE", true), 3);
945        assert_eq!(check.span("E", true), 0);
946    }
947
948    #[test]
949    fn test_codepointinversionlist_span_does_not_contain() {
950        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
951        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
952        assert_eq!(check.span("DEF", false), 2);
953        assert_eq!(check.span("KLMA", false), 3);
954    }
955
956    #[test]
957    fn test_codepointinversionlist_span_back_contains() {
958        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
959        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
960        assert_eq!(check.span_back("XYZABFH", true), 3);
961        assert_eq!(check.span_back("ABCXYZ", true), 6);
962    }
963
964    #[test]
965    fn test_codepointinversionlist_span_back_does_not_contain() {
966        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
967        let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
968        assert_eq!(check.span_back("ABCXYZ", false), 3);
969        assert_eq!(check.span_back("XYZABC", false), 6);
970    }
971
972    #[test]
973    fn test_uniset_to_inv_list() {
974        let inv_list = [
975            0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028,
976            0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
977        ];
978        let s: CodePointInversionList =
979            CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
980        let round_trip_inv_list = s.get_inversion_list_vec();
981        assert_eq!(round_trip_inv_list, inv_list);
982    }
983
984    #[test]
985    fn test_serde_serialize() {
986        let inv_list = [0x41, 0x46, 0x4B, 0x55];
987        let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
988        let json_str = serde_json::to_string(&uniset).unwrap();
989        assert_eq!(json_str, r#"["A-E","K-T"]"#);
990    }
991
992    #[test]
993    fn test_serde_serialize_surrogates() {
994        let inv_list = [0xDFAB, 0xDFFF];
995        let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
996        let json_str = serde_json::to_string(&uniset).unwrap();
997        assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#);
998    }
999
1000    #[test]
1001    fn test_serde_deserialize() {
1002        let inv_list_str = r#"["A-E","K-T"]"#;
1003        let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1004        let exp_uniset =
1005            CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1006        let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1007        assert_eq!(act_uniset, exp_uniset);
1008    }
1009
1010    #[test]
1011    fn test_serde_deserialize_surrogates() {
1012        let inv_list_str = r#"["U+DFAB-U+DFFE"]"#;
1013        let exp_inv_list = [0xDFAB, 0xDFFF];
1014        let exp_uniset =
1015            CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1016        let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1017        assert_eq!(act_uniset, exp_uniset);
1018    }
1019
1020    #[test]
1021    fn test_serde_deserialize_legacy() {
1022        let inv_list_str = "[65,70,75,85]";
1023        let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1024        let exp_uniset =
1025            CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1026        let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1027        assert_eq!(act_uniset, exp_uniset);
1028    }
1029
1030    #[test]
1031    fn test_serde_deserialize_invalid() {
1032        assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err());
1033        assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err());
1034    }
1035
1036    #[test]
1037    fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1038        let set = CodePointInversionList::bmp();
1039        let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap();
1040        let set_deserialized: CodePointInversionList =
1041            postcard::from_bytes::<CodePointInversionList>(&set_serialized)?;
1042
1043        assert_eq!(&set, &set_deserialized);
1044        assert!(!set_deserialized.inv_list.is_owned());
1045
1046        Ok(())
1047    }
1048
1049    #[test]
1050    fn databake() {
1051        databake::test_bake!(
1052            CodePointInversionList<'static>,
1053            const: unsafe {
1054                #[allow(unused_unsafe)]
1055                crate::codepointinvlist::CodePointInversionList::from_parts_unchecked(
1056                    unsafe {
1057                        zerovec::ZeroVec::from_bytes_unchecked(
1058                            b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0"
1059                        )
1060                    },
1061                    22u32,
1062                )
1063            },
1064            icu_collections,
1065            [zerovec],
1066        );
1067    }
1068}