icu_collections/codepointinvliststringlist/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module provides functionality for querying of sets of Unicode code points and strings.
6//!
7//! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
8//! it also maintains a list of strings in the set.
9//!
10//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
11
12use crate::codepointinvlist::{
13    CodePointInversionList, CodePointInversionListBuilder, CodePointInversionListError,
14    CodePointInversionListULE,
15};
16use alloc::string::{String, ToString};
17use alloc::vec::Vec;
18use displaydoc::Display;
19use yoke::Yokeable;
20use zerofrom::ZeroFrom;
21use zerovec::{VarZeroSlice, VarZeroVec};
22
23/// A data structure providing a concrete implementation of a `UnicodeSet`
24/// (which represents a set of code points and strings) using an inversion list for the code points and a simple
25/// list-like structure to store and iterate over the strings.
26#[zerovec::make_varule(CodePointInversionListAndStringListULE)]
27#[zerovec::skip_derive(Ord)]
28#[zerovec::derive(Debug)]
29#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
30// Valid to auto-derive Deserialize because the invariants are weakly held
31#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
32#[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
33pub struct CodePointInversionListAndStringList<'data> {
34    #[cfg_attr(feature = "serde", serde(borrow))]
35    #[zerovec::varule(CodePointInversionListULE)]
36    cp_inv_list: CodePointInversionList<'data>,
37    // Invariants (weakly held):
38    //   - no input string is length 1 (a length 1 string should be a single code point)
39    //   - the string list is sorted
40    //   - the elements in the string list are unique
41    #[cfg_attr(feature = "serde", serde(borrow))]
42    str_list: VarZeroVec<'data, str>,
43}
44
45#[cfg(feature = "databake")]
46impl databake::Bake for CodePointInversionListAndStringList<'_> {
47    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
48        env.insert("icu_collections");
49        let cp_inv_list = self.cp_inv_list.bake(env);
50        let str_list = self.str_list.bake(env);
51        // Safe because our parts are safe.
52        databake::quote! {
53            icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list)
54        }
55    }
56}
57
58impl<'data> CodePointInversionListAndStringList<'data> {
59    /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
60    /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
61    pub fn try_from(
62        cp_inv_list: CodePointInversionList<'data>,
63        str_list: VarZeroVec<'data, str>,
64    ) -> Result<Self, CodePointInversionListAndStringListError> {
65        // Verify invariants:
66        // Do so by using the equivalent of str_list.iter().windows(2) to get
67        // overlapping windows of size 2. The above putative code is not possible
68        // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
69        // because the non-fixed size elements necessitate at least some type
70        // of allocation.
71        {
72            let mut it = str_list.iter();
73            if let Some(mut x) = it.next() {
74                if x.len() == 1 {
75                    return Err(
76                        CodePointInversionListAndStringListError::InvalidStringLength(
77                            x.to_string(),
78                        ),
79                    );
80                }
81                for y in it {
82                    if x.len() == 1 {
83                        return Err(
84                            CodePointInversionListAndStringListError::InvalidStringLength(
85                                x.to_string(),
86                            ),
87                        );
88                    } else if x == y {
89                        return Err(
90                            CodePointInversionListAndStringListError::StringListNotUnique(
91                                x.to_string(),
92                            ),
93                        );
94                    } else if x > y {
95                        return Err(
96                            CodePointInversionListAndStringListError::StringListNotSorted(
97                                x.to_string(),
98                                y.to_string(),
99                            ),
100                        );
101                    }
102
103                    // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
104                    x = y;
105                }
106            }
107        }
108
109        Ok(CodePointInversionListAndStringList {
110            cp_inv_list,
111            str_list,
112        })
113    }
114
115    #[doc(hidden)]
116    pub const fn from_parts_unchecked(
117        cp_inv_list: CodePointInversionList<'data>,
118        str_list: VarZeroVec<'data, str>,
119    ) -> Self {
120        CodePointInversionListAndStringList {
121            cp_inv_list,
122            str_list,
123        }
124    }
125
126    /// Returns the number of elements in this set (its cardinality).
127    /// Note than the elements of a set may include both individual
128    /// codepoints and strings.
129    pub fn size(&self) -> usize {
130        self.cp_inv_list.size() + self.str_list.len()
131    }
132
133    /// Return true if this set contains multi-code point strings or the empty string.
134    pub fn has_strings(&self) -> bool {
135        !self.str_list.is_empty()
136    }
137
138    ///
139    /// # Examples
140    /// ```
141    /// use icu::collections::codepointinvlist::CodePointInversionList;
142    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
143    /// use zerovec::VarZeroVec;
144    ///
145    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
146    /// let cp_list =
147    ///    CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
148    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
149    /// let str_list = VarZeroVec::<str>::from(str_slice);
150    ///
151    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
152    ///
153    /// assert!(cpilsl.contains("bmp_max"));
154    /// assert!(cpilsl.contains(""));
155    /// assert!(cpilsl.contains("A"));
156    /// assert!(cpilsl.contains("ቔ"));  // U+1254 ETHIOPIC SYLLABLE QHEE
157    /// assert!(!cpilsl.contains("bazinga!"));
158    /// ```
159    pub fn contains(&self, s: &str) -> bool {
160        let mut chars = s.chars();
161        if let Some(first_char) = chars.next() {
162            if chars.next().is_none() {
163                return self.contains_char(first_char);
164            }
165        }
166        self.str_list.binary_search(s).is_ok()
167    }
168
169    ///
170    /// # Examples
171    /// ```
172    /// use icu::collections::codepointinvlist::CodePointInversionList;
173    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
174    /// use zerovec::VarZeroVec;
175    ///
176    /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
177    /// let cp_list =
178    ///     CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
179    /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
180    /// let str_list = VarZeroVec::<str>::from(str_slice);
181    ///
182    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
183    ///
184    /// assert!(cpilsl.contains32(0));
185    /// assert!(cpilsl.contains32(0x0042));
186    /// assert!(!cpilsl.contains32(0x0080));
187    /// ```
188    pub fn contains32(&self, cp: u32) -> bool {
189        self.cp_inv_list.contains32(cp)
190    }
191
192    ///
193    /// # Examples
194    /// ```
195    /// use icu::collections::codepointinvlist::CodePointInversionList;
196    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
197    /// use zerovec::VarZeroVec;
198    ///
199    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
200    /// let cp_list =
201    ///    CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
202    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
203    /// let str_list = VarZeroVec::<str>::from(str_slice);
204    ///
205    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
206    ///
207    /// assert!(cpilsl.contains_char('A'));
208    /// assert!(cpilsl.contains_char('ቔ'));  // U+1254 ETHIOPIC SYLLABLE QHEE
209    /// assert!(!cpilsl.contains_char('\u{1_0000}'));
210    /// assert!(!cpilsl.contains_char('🨫'));  // U+1FA2B NEUTRAL CHESS TURNED QUEEN
211    pub fn contains_char(&self, ch: char) -> bool {
212        self.contains32(ch as u32)
213    }
214
215    /// Access the underlying [`CodePointInversionList`].
216    pub fn code_points(&self) -> &CodePointInversionList<'data> {
217        &self.cp_inv_list
218    }
219
220    /// Access the contained strings.
221    pub fn strings(&self) -> &VarZeroSlice<str> {
222        &self.str_list
223    }
224}
225
226impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
227    fn from_iter<I>(it: I) -> Self
228    where
229        I: IntoIterator<Item = &'a str>,
230    {
231        let mut builder = CodePointInversionListBuilder::new();
232        let mut strings = Vec::<&str>::new();
233        for s in it {
234            let mut chars = s.chars();
235            if let Some(first_char) = chars.next() {
236                if chars.next().is_none() {
237                    builder.add_char(first_char);
238                    continue;
239                }
240            }
241            strings.push(s);
242        }
243
244        // Ensure that the string list is sorted. If not, the binary search that
245        // is used for `.contains(&str)` will return garbase otuput.
246        strings.sort_unstable();
247        strings.dedup();
248
249        let cp_inv_list = builder.build();
250        let str_list = VarZeroVec::<str>::from(&strings);
251
252        CodePointInversionListAndStringList {
253            cp_inv_list,
254            str_list,
255        }
256    }
257}
258
259/// Custom Errors for [`CodePointInversionListAndStringList`].
260///
261/// Re-exported as [`Error`].
262#[derive(Display, Debug)]
263pub enum CodePointInversionListAndStringListError {
264    /// An invalid CodePointInversionList was constructed
265    #[displaydoc("Invalid code point inversion list: {0:?}")]
266    InvalidCodePointInversionList(CodePointInversionListError),
267    /// A string in the string list had an invalid length
268    #[displaydoc("Invalid string length for string: {0}")]
269    InvalidStringLength(String),
270    /// A string in the string list appears more than once
271    #[displaydoc("String list has duplicate: {0}")]
272    StringListNotUnique(String),
273    /// Two strings in the string list compare to each other opposite of sorted order
274    #[displaydoc("Strings in string list not in sorted order: ({0}, {1})")]
275    StringListNotSorted(String, String),
276}
277
278#[doc(no_inline)]
279pub use CodePointInversionListAndStringListError as Error;
280
281#[cfg(test)]
282mod tests {
283    use super::*;
284
285    #[test]
286    fn test_size_has_strings() {
287        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
288        let cp_list =
289            CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
290        let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"];
291        let str_list = VarZeroVec::<str>::from(str_slice);
292
293        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
294
295        assert!(cpilsl.has_strings());
296        assert_eq!(8, cpilsl.size());
297    }
298
299    #[test]
300    fn test_empty_string_allowed() {
301        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
302        let cp_list =
303            CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
304        let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
305        let str_list = VarZeroVec::<str>::from(str_slice);
306
307        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
308
309        assert!(cpilsl.has_strings());
310        assert_eq!(9, cpilsl.size());
311    }
312
313    #[test]
314    fn test_invalid_string() {
315        let cp_slice = &[0, 1];
316        let cp_list =
317            CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
318        let str_slice = &["a"];
319        let str_list = VarZeroVec::<str>::from(str_slice);
320
321        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
322
323        assert!(matches!(
324            cpilsl,
325            Err(CodePointInversionListAndStringListError::InvalidStringLength(_))
326        ));
327    }
328
329    #[test]
330    fn test_invalid_string_list_has_duplicate() {
331        let cp_slice = &[0, 1];
332        let cp_list =
333            CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
334        let str_slice = &["abc", "abc"];
335        let str_list = VarZeroVec::<str>::from(str_slice);
336
337        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
338
339        assert!(matches!(
340            cpilsl,
341            Err(CodePointInversionListAndStringListError::StringListNotUnique(_))
342        ));
343    }
344
345    #[test]
346    fn test_invalid_string_list_not_sorted() {
347        let cp_slice = &[0, 1];
348        let cp_list =
349            CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
350        let str_slice = &["xyz", "abc"];
351        let str_list = VarZeroVec::<str>::from(str_slice);
352
353        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
354
355        assert!(matches!(
356            cpilsl,
357            Err(CodePointInversionListAndStringListError::StringListNotSorted(_, _))
358        ));
359    }
360
361    #[test]
362    fn test_from_iter_invariants() {
363        let in_strs_1 = ["a", "abc", "xyz", "abc"];
364        let in_strs_2 = ["xyz", "abc", "a", "abc"];
365
366        let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1);
367        let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2);
368
369        assert_eq!(cpilsl_1, cpilsl_2);
370
371        assert!(cpilsl_1.has_strings());
372        assert!(cpilsl_1.contains("abc"));
373        assert!(cpilsl_1.contains("xyz"));
374        assert!(!cpilsl_1.contains("def"));
375
376        assert_eq!(1, cpilsl_1.cp_inv_list.size());
377        assert!(cpilsl_1.contains_char('a'));
378        assert!(!cpilsl_1.contains_char('0'));
379        assert!(!cpilsl_1.contains_char('q'));
380
381        assert_eq!(3, cpilsl_1.size());
382    }
383}