icu_collections/codepointinvliststringlist/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module provides functionality for querying of sets of Unicode code points and strings.
6//!
7//! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
8//! it also maintains a list of strings in the set.
9//!
10//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
11
12#[cfg(feature = "alloc")]
13use crate::codepointinvlist::CodePointInversionListBuilder;
14use crate::codepointinvlist::{CodePointInversionList, CodePointInversionListULE};
15#[cfg(feature = "alloc")]
16use alloc::string::{String, ToString};
17#[cfg(feature = "alloc")]
18use alloc::vec::Vec;
19use displaydoc::Display;
20use yoke::Yokeable;
21use zerofrom::ZeroFrom;
22use zerovec::{VarZeroSlice, VarZeroVec};
23
24/// A data structure providing a concrete implementation of a set of code points and strings,
25/// using an inversion list for the code points.
26///
27/// This is what ICU4C calls a `UnicodeSet`.
28impl core::fmt::Debug for CodePointInversionListAndStringListULE {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let this =
            <CodePointInversionListAndStringList as
                    zerovec::__zerovec_internal_reexport::ZeroFrom<CodePointInversionListAndStringListULE>>::zero_from(self);
        <CodePointInversionListAndStringList as
                core::fmt::Debug>::fmt(&this, f)
    }
}#[zerovec::make_varule(CodePointInversionListAndStringListULE)]
29#[zerovec::skip_derive(Ord)]
30#[zerovec::derive(Debug)]
31#[derive(#[automatically_derived]
impl<'data> ::core::fmt::Debug for CodePointInversionListAndStringList<'data>
    {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field2_finish(f,
            "CodePointInversionListAndStringList", "cp_inv_list",
            &self.cp_inv_list, "str_list", &&self.str_list)
    }
}Debug, #[automatically_derived]
impl<'data> ::core::cmp::Eq for CodePointInversionListAndStringList<'data> {
    #[inline]
    #[doc(hidden)]
    #[coverage(off)]
    fn assert_receiver_is_total_eq(&self) -> () {
        let _: ::core::cmp::AssertParamIsEq<CodePointInversionList<'data>>;
        let _: ::core::cmp::AssertParamIsEq<VarZeroVec<'data, str>>;
    }
}Eq, #[automatically_derived]
impl<'data> ::core::cmp::PartialEq for
    CodePointInversionListAndStringList<'data> {
    #[inline]
    fn eq(&self, other: &CodePointInversionListAndStringList<'data>) -> bool {
        self.cp_inv_list == other.cp_inv_list &&
            self.str_list == other.str_list
    }
}PartialEq, #[automatically_derived]
impl<'data> ::core::clone::Clone for
    CodePointInversionListAndStringList<'data> {
    #[inline]
    fn clone(&self) -> CodePointInversionListAndStringList<'data> {
        CodePointInversionListAndStringList {
            cp_inv_list: ::core::clone::Clone::clone(&self.cp_inv_list),
            str_list: ::core::clone::Clone::clone(&self.str_list),
        }
    }
}Clone, unsafe impl<'a> yoke::Yokeable<'a> for
    CodePointInversionListAndStringList<'static> where  {
    type Output = CodePointInversionListAndStringList<'a>;
    #[inline]
    fn transform(&'a self) -> &'a Self::Output { self }
    #[inline]
    fn transform_owned(self) -> Self::Output { self }
    #[inline]
    unsafe fn make(this: Self::Output) -> Self {
        use core::{mem, ptr};
        if true {
            if !(mem::size_of::<Self::Output>() == mem::size_of::<Self>()) {
                ::core::panicking::panic("assertion failed: mem::size_of::<Self::Output>() == mem::size_of::<Self>()")
            };
        };
        let ptr: *const Self = (&this as *const Self::Output).cast();

        #[allow(forgetting_copy_types, clippy :: forget_copy, clippy ::
        forget_non_drop, clippy :: mem_forget)]
        mem::forget(this);
        ptr::read(ptr)
    }
    #[inline]
    fn transform_mut<F>(&'a mut self, f: F) where F: 'static +
        for<'b> FnOnce(&'b mut Self::Output) {
        unsafe {
            f(core::mem::transmute::<&'a mut Self,
                        &'a mut Self::Output>(self))
        }
    }
}Yokeable, impl<'zf, 'zf_inner>
    zerofrom::ZeroFrom<'zf, CodePointInversionListAndStringList<'zf_inner>>
    for CodePointInversionListAndStringList<'zf> where  {
    fn zero_from(this: &'zf CodePointInversionListAndStringList<'zf_inner>)
        -> Self {
        match *this {
            CodePointInversionListAndStringList {
                cp_inv_list: ref __binding_0, str_list: ref __binding_1 } => {
                CodePointInversionListAndStringList {
                    cp_inv_list: <CodePointInversionList<'zf> as
                            zerofrom::ZeroFrom<'zf,
                            CodePointInversionList<'zf_inner>>>::zero_from(__binding_0),
                    str_list: <VarZeroVec<'zf, str> as
                            zerofrom::ZeroFrom<'zf,
                            VarZeroVec<'zf_inner, str>>>::zero_from(__binding_1),
                }
            }
        }
    }
}ZeroFrom)]
32#[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
33// Valid to auto-derive Deserialize because the invariants are weakly held
34#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
35#[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
36pub struct CodePointInversionListAndStringList<'data> {
37    #[cfg_attr(feature = "serde", serde(borrow))]
38    #[zerovec::varule(CodePointInversionListULE)]
39    cp_inv_list: CodePointInversionList<'data>,
40    // Invariants (weakly held):
41    //   - no input string is length 1 (a length 1 string should be a single code point)
42    //   - the string list is sorted
43    //   - the elements in the string list are unique
44    #[cfg_attr(feature = "serde", serde(borrow))]
45    str_list: VarZeroVec<'data, str>,
46}
47
48#[cfg(feature = "databake")]
49impl databake::Bake for CodePointInversionListAndStringList<'_> {
50    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
51        env.insert("icu_collections");
52        let cp_inv_list = self.cp_inv_list.bake(env);
53        let str_list = self.str_list.bake(env);
54        // Safe because our parts are safe.
55        databake::quote! {
56            icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list)
57        }
58    }
59}
60
61#[cfg(feature = "databake")]
62impl databake::BakeSize for CodePointInversionListAndStringList<'_> {
63    fn borrows_size(&self) -> usize {
64        self.cp_inv_list.borrows_size() + self.str_list.borrows_size()
65    }
66}
67
68impl<'data> CodePointInversionListAndStringList<'data> {
69    /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
70    /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
71    pub fn try_from(
72        cp_inv_list: CodePointInversionList<'data>,
73        str_list: VarZeroVec<'data, str>,
74    ) -> Result<Self, InvalidStringList> {
75        // Verify invariants:
76        // Do so by using the equivalent of str_list.iter().windows(2) to get
77        // overlapping windows of size 2. The above putative code is not possible
78        // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
79        // because the non-fixed size elements necessitate at least some type
80        // of allocation.
81        {
82            let mut it = str_list.iter();
83            if let Some(mut x) = it.next() {
84                if x.len() == 1 {
85                    return Err(InvalidStringList::InvalidStringLength(
86                        #[cfg(feature = "alloc")]
87                        x.to_string(),
88                    ));
89                }
90                for y in it {
91                    if x.len() == 1 {
92                        return Err(InvalidStringList::InvalidStringLength(
93                            #[cfg(feature = "alloc")]
94                            x.to_string(),
95                        ));
96                    } else if x == y {
97                        return Err(InvalidStringList::StringListNotUnique(
98                            #[cfg(feature = "alloc")]
99                            x.to_string(),
100                        ));
101                    } else if x > y {
102                        return Err(InvalidStringList::StringListNotSorted(
103                            #[cfg(feature = "alloc")]
104                            x.to_string(),
105                            #[cfg(feature = "alloc")]
106                            y.to_string(),
107                        ));
108                    }
109
110                    // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
111                    x = y;
112                }
113            }
114        }
115
116        Ok(CodePointInversionListAndStringList {
117            cp_inv_list,
118            str_list,
119        })
120    }
121
122    #[doc(hidden)] // databake internal
123    pub const fn from_parts_unchecked(
124        cp_inv_list: CodePointInversionList<'data>,
125        str_list: VarZeroVec<'data, str>,
126    ) -> Self {
127        CodePointInversionListAndStringList {
128            cp_inv_list,
129            str_list,
130        }
131    }
132
133    /// Returns the number of elements in this set (its cardinality).
134    /// Note than the elements of a set may include both individual
135    /// codepoints and strings.
136    pub fn size(&self) -> usize {
137        self.cp_inv_list.size() + self.str_list.len()
138    }
139
140    /// Return true if this set contains multi-code point strings or the empty string.
141    pub fn has_strings(&self) -> bool {
142        !self.str_list.is_empty()
143    }
144
145    ///
146    /// # Examples
147    /// ```
148    /// use icu::collections::codepointinvlist::CodePointInversionList;
149    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
150    /// use zerovec::VarZeroVec;
151    ///
152    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
153    /// let cp_list =
154    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
155    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
156    /// let str_list = VarZeroVec::<str>::from(str_slice);
157    ///
158    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
159    ///
160    /// assert!(cpilsl.contains_str("bmp_max"));
161    /// assert!(cpilsl.contains_str(""));
162    /// assert!(cpilsl.contains_str("A"));
163    /// assert!(cpilsl.contains_str("ቔ"));  // U+1254 ETHIOPIC SYLLABLE QHEE
164    /// assert!(!cpilsl.contains_str("bazinga!"));
165    /// ```
166    pub fn contains_str(&self, s: &str) -> bool {
167        let mut chars = s.chars();
168        if let Some(first_char) = chars.next() {
169            if chars.next().is_none() {
170                return self.contains(first_char);
171            }
172        }
173        self.str_list.binary_search(s).is_ok()
174    }
175
176    ///
177    /// # Examples
178    /// ```
179    /// use icu::collections::codepointinvlist::CodePointInversionList;
180    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
181    /// use zerovec::VarZeroVec;
182    ///
183    /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
184    /// let cp_list =
185    ///     CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
186    /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
187    /// let str_list = VarZeroVec::<str>::from(str_slice);
188    ///
189    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
190    ///
191    /// assert!(cpilsl.contains32(0));
192    /// assert!(cpilsl.contains32(0x0042));
193    /// assert!(!cpilsl.contains32(0x0080));
194    /// ```
195    pub fn contains32(&self, cp: u32) -> bool {
196        self.cp_inv_list.contains32(cp)
197    }
198
199    ///
200    /// # Examples
201    /// ```
202    /// use icu::collections::codepointinvlist::CodePointInversionList;
203    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
204    /// use zerovec::VarZeroVec;
205    ///
206    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
207    /// let cp_list =
208    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
209    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
210    /// let str_list = VarZeroVec::<str>::from(str_slice);
211    ///
212    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
213    ///
214    /// assert!(cpilsl.contains('A'));
215    /// assert!(cpilsl.contains('ቔ'));  // U+1254 ETHIOPIC SYLLABLE QHEE
216    /// assert!(!cpilsl.contains('\u{1_0000}'));
217    /// assert!(!cpilsl.contains('🨫'));  // U+1FA2B NEUTRAL CHESS TURNED QUEEN
218    pub fn contains(&self, ch: char) -> bool {
219        self.contains32(ch as u32)
220    }
221
222    /// Access the underlying [`CodePointInversionList`].
223    pub fn code_points(&self) -> &CodePointInversionList<'data> {
224        &self.cp_inv_list
225    }
226
227    /// Access the contained strings.
228    pub fn strings(&self) -> &VarZeroSlice<str> {
229        &self.str_list
230    }
231}
232
233#[cfg(feature = "alloc")]
234/// ✨ *Enabled with the `alloc` Cargo feature.*
235impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
236    fn from_iter<I>(it: I) -> Self
237    where
238        I: IntoIterator<Item = &'a str>,
239    {
240        let mut builder = CodePointInversionListBuilder::new();
241        let mut strings = Vec::<&str>::new();
242        for s in it {
243            let mut chars = s.chars();
244            if let Some(first_char) = chars.next() {
245                if chars.next().is_none() {
246                    builder.add_char(first_char);
247                    continue;
248                }
249            }
250            strings.push(s);
251        }
252
253        // Ensure that the string list is sorted. If not, the binary search that
254        // is used for `.contains(&str)` will return garbage output.
255        strings.sort_unstable();
256        strings.dedup();
257
258        let cp_inv_list = builder.build();
259        let str_list = VarZeroVec::<str>::from(&strings);
260
261        CodePointInversionListAndStringList {
262            cp_inv_list,
263            str_list,
264        }
265    }
266}
267
268/// Custom Errors for [`CodePointInversionListAndStringList`].
269#[derive(#[allow(non_upper_case_globals, unused_attributes, unused_qualifications)]
const _: () =
    {
        impl ::core::fmt::Display for InvalidStringList {
            fn fmt(&self, formatter: &mut ::core::fmt::Formatter)
                -> ::core::fmt::Result {

                #[allow(unused_variables)]
                match self {
                    Self::InvalidStringLength() => {
                        formatter.write_fmt(format_args!("A string in the string list had an invalid length"))
                    }
                    Self::StringListNotUnique() => {
                        formatter.write_fmt(format_args!("A string in the string list appears more than once"))
                    }
                    Self::StringListNotSorted() => {
                        formatter.write_fmt(format_args!("Two strings in the string list compare to each other opposite of sorted order"))
                    }
                }
            }
        }
    };Display, #[automatically_derived]
impl ::core::fmt::Debug for InvalidStringList {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::write_str(f,
            match self {
                InvalidStringList::InvalidStringLength() =>
                    "InvalidStringLength",
                InvalidStringList::StringListNotUnique() =>
                    "StringListNotUnique",
                InvalidStringList::StringListNotSorted() =>
                    "StringListNotSorted",
            })
    }
}Debug)]
270pub enum InvalidStringList {
271    /// A string in the string list had an invalid length
272    #[cfg_attr(feature = "alloc", displaydoc("Invalid string length for string: {0}"))]
273    InvalidStringLength(#[cfg(feature = "alloc")] String),
274    /// A string in the string list appears more than once
275    #[cfg_attr(feature = "alloc", displaydoc("String list has duplicate: {0}"))]
276    StringListNotUnique(#[cfg(feature = "alloc")] String),
277    /// Two strings in the string list compare to each other opposite of sorted order
278    #[cfg_attr(
279        feature = "alloc",
280        displaydoc("Strings in string list not in sorted order: ({0}, {1})")
281    )]
282    StringListNotSorted(
283        #[cfg(feature = "alloc")] String,
284        #[cfg(feature = "alloc")] String,
285    ),
286}
287
288#[cfg(test)]
289mod tests {
290    use super::*;
291
292    #[test]
293    fn test_size_has_strings() {
294        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
295        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
296        let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"];
297        let str_list = VarZeroVec::<str>::from(str_slice);
298
299        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
300
301        assert!(cpilsl.has_strings());
302        assert_eq!(8, cpilsl.size());
303    }
304
305    #[test]
306    fn test_empty_string_allowed() {
307        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
308        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
309        let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
310        let str_list = VarZeroVec::<str>::from(str_slice);
311
312        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
313
314        assert!(cpilsl.has_strings());
315        assert_eq!(9, cpilsl.size());
316    }
317
318    #[test]
319    fn test_invalid_string() {
320        let cp_slice = &[0, 1];
321        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
322        let str_slice = &["a"];
323        let str_list = VarZeroVec::<str>::from(str_slice);
324
325        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
326
327        assert!(matches!(
328            cpilsl,
329            Err(InvalidStringList::InvalidStringLength(_))
330        ));
331    }
332
333    #[test]
334    fn test_invalid_string_list_has_duplicate() {
335        let cp_slice = &[0, 1];
336        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
337        let str_slice = &["abc", "abc"];
338        let str_list = VarZeroVec::<str>::from(str_slice);
339
340        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
341
342        assert!(matches!(
343            cpilsl,
344            Err(InvalidStringList::StringListNotUnique(_))
345        ));
346    }
347
348    #[test]
349    fn test_invalid_string_list_not_sorted() {
350        let cp_slice = &[0, 1];
351        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
352        let str_slice = &["xyz", "abc"];
353        let str_list = VarZeroVec::<str>::from(str_slice);
354
355        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
356
357        assert!(matches!(
358            cpilsl,
359            Err(InvalidStringList::StringListNotSorted(_, _))
360        ));
361    }
362
363    #[test]
364    fn test_from_iter_invariants() {
365        let in_strs_1 = ["a", "abc", "xyz", "abc"];
366        let in_strs_2 = ["xyz", "abc", "a", "abc"];
367
368        let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1);
369        let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2);
370
371        assert_eq!(cpilsl_1, cpilsl_2);
372
373        assert!(cpilsl_1.has_strings());
374        assert!(cpilsl_1.contains_str("abc"));
375        assert!(cpilsl_1.contains_str("xyz"));
376        assert!(!cpilsl_1.contains_str("def"));
377
378        assert_eq!(1, cpilsl_1.cp_inv_list.size());
379        assert!(cpilsl_1.contains('a'));
380        assert!(!cpilsl_1.contains('0'));
381        assert!(!cpilsl_1.contains('q'));
382
383        assert_eq!(3, cpilsl_1.size());
384    }
385}