icu_collections/codepointtrie/cptrie.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::codepointtrie::error::Error;
6use crate::codepointtrie::impl_const::*;
7
8use crate::codepointinvlist::CodePointInversionList;
9use core::char::CharTryFromError;
10use core::convert::Infallible;
11use core::convert::TryFrom;
12use core::fmt::Display;
13use core::iter::FromIterator;
14use core::num::TryFromIntError;
15use core::ops::RangeInclusive;
16use yoke::Yokeable;
17use zerofrom::ZeroFrom;
18use zerovec::ZeroVec;
19use zerovec::ZeroVecError;
20
21/// The type of trie represents whether the trie has an optimization that
22/// would make it smaller or faster.
23///
24/// Regarding performance, a trie being a small or fast type affects the number of array lookups
25/// needed for code points in the range `[0x1000, 0x10000)`. In this range, `Small` tries use 4 array lookups,
26/// while `Fast` tries use 2 array lookups.
27/// Code points before the interval (in `[0, 0x1000)`) will always use 2 array lookups.
28/// Code points after the interval (in `[0x10000, 0x10FFFF]`) will always use 4 array lookups.
29///
30/// Regarding size, `Fast` type tries are larger than `Small` type tries because the minimum size of
31/// the index array is larger. The minimum size is the "fast max" limit, which is the limit of the range
32/// of code points with 2 array lookups.
33///
34/// See the document [Unicode Properties and Code Point Tries in ICU4X](https://github.com/unicode-org/icu4x/blob/main/documents/design/properties_code_point_trie.md).
35///
36/// Also see [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
37#[derive(Clone, Copy, PartialEq, Debug, Eq)]
38#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
39#[cfg_attr(feature = "databake", derive(databake::Bake), databake(path = icu_collections::codepointtrie))]
40pub enum TrieType {
41 /// Represents the "fast" type code point tries for the
42 /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`.
43 Fast = 0,
44 /// Represents the "small" type code point tries for the
45 /// [`TrieType`] trait. The "fast max" limit is set to `0x0fff`.
46 Small = 1,
47}
48
49// TrieValue trait
50
51// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"
52
53/// A trait representing the values stored in the data array of a [`CodePointTrie`].
54/// This trait is used as a type parameter in constructing a `CodePointTrie`.
55pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static {
56 /// Last-resort fallback value to return if we cannot read data from the trie.
57 ///
58 /// In most cases, the error value is read from the last element of the `data` array,
59 /// this value is used for empty codepointtrie arrays
60
61 /// Error type when converting from a u32 to this `TrieValue`.
62 type TryFromU32Error: Display;
63 /// A parsing function that is primarily motivated by deserialization contexts.
64 /// When the serialization type width is smaller than 32 bits, then it is expected
65 /// that the call site will widen the value to a `u32` first.
66 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>;
67
68 /// A method for converting back to a `u32` that can roundtrip through
69 /// [`Self::try_from_u32()`]. The default implementation of this trait
70 /// method panics in debug mode and returns 0 in release mode.
71 ///
72 /// This method is allowed to have GIGO behavior when fed a value that has
73 /// no corresponding `u32` (since such values cannot be stored in the trie)
74 fn to_u32(self) -> u32 {
75 debug_assert!(
76 false,
77 "TrieValue::to_u32() not implemented for {}",
78 ::core::any::type_name::<Self>()
79 );
80 0
81 }
82}
83
84macro_rules! impl_primitive_trie_value {
85 ($primitive:ty, $error:ty) => {
86 impl TrieValue for $primitive {
87 type TryFromU32Error = $error;
88 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
89 Self::try_from(i)
90 }
91
92 fn to_u32(self) -> u32 {
93 // bitcast when the same size, zero-extend/sign-extend
94 // when not the same size
95 self as u32
96 }
97 }
98 };
99}
100
101impl_primitive_trie_value!(u8, TryFromIntError);
102impl_primitive_trie_value!(u16, TryFromIntError);
103impl_primitive_trie_value!(u32, Infallible);
104impl_primitive_trie_value!(i8, TryFromIntError);
105impl_primitive_trie_value!(i16, TryFromIntError);
106impl_primitive_trie_value!(i32, TryFromIntError);
107impl_primitive_trie_value!(char, CharTryFromError);
108
109/// Helper function used by [`get_range`]. Converts occurrences of trie's null
110/// value into the provided `null_value`.
111///
112/// Note: the ICU version of this helper function uses a `ValueFilter` function
113/// to apply a transform on a non-null value. But currently, this implementation
114/// stops short of that functionality, and instead leaves the non-null trie value
115/// untouched. This is equivalent to having a `ValueFilter` function that is the
116/// identity function.
117fn maybe_filter_value<T: TrieValue>(value: T, trie_null_value: T, null_value: T) -> T {
118 if value == trie_null_value {
119 null_value
120 } else {
121 value
122 }
123}
124
125/// This struct represents a de-serialized [`CodePointTrie`] that was exported from
126/// ICU binary data.
127///
128/// For more information:
129/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie)
130/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
131// serde impls in crate::serde
132#[derive(Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
133pub struct CodePointTrie<'trie, T: TrieValue> {
134 pub(crate) header: CodePointTrieHeader,
135 pub(crate) index: ZeroVec<'trie, u16>,
136 pub(crate) data: ZeroVec<'trie, T>,
137 // serde impl skips this field
138 #[zerofrom(clone)] // TrieValue is Copy, this allows us to avoid
139 // a T: ZeroFrom bound
140 pub(crate) error_value: T,
141}
142
143/// This struct contains the fixed-length header fields of a [`CodePointTrie`].
144#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
145#[cfg_attr(feature = "databake", derive(databake::Bake), databake(path = icu_collections::codepointtrie))]
146#[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
147pub struct CodePointTrieHeader {
148 /// The code point of the start of the last range of the trie. A
149 /// range is defined as a partition of the code point space such that the
150 /// value in this trie associated with all code points of the same range is
151 /// the same.
152 ///
153 /// For the property value data for many Unicode properties,
154 /// often times, `high_start` is `U+10000` or lower. In such cases, not
155 /// reserving space in the `index` array for duplicate values is a large
156 /// savings. The "highValue" associated with the `high_start` range is
157 /// stored at the second-to-last position of the `data` array.
158 /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.)
159 pub high_start: u32,
160 /// A version of the `high_start` value that is right-shifted 12 spaces,
161 /// but is rounded up to a multiple `0x1000` for easy testing from UTF-8
162 /// lead bytes.
163 pub shifted12_high_start: u16,
164 /// Offset for the null block in the "index-3" table of the `index` array.
165 /// Set to an impossibly high value (e.g., `0xffff`) if there is no
166 /// dedicated index-3 null block.
167 pub index3_null_offset: u16,
168 /// Internal data null block offset, not shifted.
169 /// Set to an impossibly high value (e.g., `0xfffff`) if there is no
170 /// dedicated data null block.
171 pub data_null_offset: u32,
172 /// The value stored in the trie that represents a null value being
173 /// associated to a code point.
174 pub null_value: u32,
175 /// The enum value representing the type of trie, where trie type is as it
176 /// is defined in ICU (ex: Fast, Small).
177 pub trie_type: TrieType,
178}
179
180impl TryFrom<u8> for TrieType {
181 type Error = crate::codepointtrie::error::Error;
182
183 fn try_from(trie_type_int: u8) -> Result<TrieType, crate::codepointtrie::error::Error> {
184 match trie_type_int {
185 0 => Ok(TrieType::Fast),
186 1 => Ok(TrieType::Small),
187 _ => Err(crate::codepointtrie::error::Error::FromDeserialized {
188 reason: "Cannot parse value for trie_type",
189 }),
190 }
191 }
192}
193
194impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
195 #[doc(hidden)] // databake internal
196 pub const fn from_parts(
197 header: CodePointTrieHeader,
198 index: ZeroVec<'trie, u16>,
199 data: ZeroVec<'trie, T>,
200 error_value: T,
201 ) -> Self {
202 Self {
203 header,
204 index,
205 data,
206 error_value,
207 }
208 }
209
210 /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
211 /// array and `data` array, whose data values have width `W`.
212 pub fn try_new(
213 header: CodePointTrieHeader,
214 index: ZeroVec<'trie, u16>,
215 data: ZeroVec<'trie, T>,
216 ) -> Result<CodePointTrie<'trie, T>, Error> {
217 // Validation invariants are not needed here when constructing a new
218 // `CodePointTrie` because:
219 //
220 // - Rust includes the size of a slice (or Vec or similar), which allows it
221 // to prevent lookups at out-of-bounds indices, whereas in C++, it is the
222 // programmer's responsibility to keep track of length info.
223 // - For lookups into collections, Rust guarantees that a fallback value will
224 // be returned in the case of `.get()` encountering a lookup error, via
225 // the `Option` type.
226 // - The `ZeroVec` serializer stores the length of the array along with the
227 // ZeroVec data, meaning that a deserializer would also see that length info.
228
229 let error_value = data.last().ok_or(Error::EmptyDataVector)?;
230 let trie: CodePointTrie<'trie, T> = CodePointTrie {
231 header,
232 index,
233 data,
234 error_value,
235 };
236 Ok(trie)
237 }
238
239 /// Returns the position in the data array containing the trie's stored
240 /// error value.
241 #[inline(always)] // `always` based on normalizer benchmarking
242 fn trie_error_val_index(&self) -> u32 {
243 self.data.len() as u32 - ERROR_VALUE_NEG_DATA_OFFSET
244 }
245
246 fn internal_small_index(&self, code_point: u32) -> u32 {
247 let mut index1_pos: u32 = code_point >> SHIFT_1;
248 if self.header.trie_type == TrieType::Fast {
249 debug_assert!(
250 FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
251 );
252 index1_pos = index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH;
253 } else {
254 assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT);
255 index1_pos += SMALL_INDEX_LENGTH;
256 }
257 let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) {
258 index1_val
259 } else {
260 return self.trie_error_val_index();
261 };
262 let index3_block_idx: u32 = (index1_val as u32) + ((code_point >> SHIFT_2) & INDEX_2_MASK);
263 let mut index3_block: u32 =
264 if let Some(index3_block) = self.index.get(index3_block_idx as usize) {
265 index3_block as u32
266 } else {
267 return self.trie_error_val_index();
268 };
269 let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK;
270 let mut data_block: u32;
271 if index3_block & 0x8000 == 0 {
272 // 16-bit indexes
273 data_block =
274 if let Some(data_block) = self.index.get((index3_block + index3_pos) as usize) {
275 data_block as u32
276 } else {
277 return self.trie_error_val_index();
278 };
279 } else {
280 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
281 index3_block = (index3_block & 0x7fff) + (index3_pos & !7) + (index3_pos >> 3);
282 index3_pos &= 7;
283 data_block = if let Some(data_block) = self.index.get(index3_block as usize) {
284 data_block as u32
285 } else {
286 return self.trie_error_val_index();
287 };
288 data_block = (data_block << (2 + (2 * index3_pos))) & 0x30000;
289 index3_block += 1;
290 data_block =
291 if let Some(index3_val) = self.index.get((index3_block + index3_pos) as usize) {
292 data_block | (index3_val as u32)
293 } else {
294 return self.trie_error_val_index();
295 };
296 }
297 // Returns data_pos == data_block (offset) +
298 // portion of code_point bit field for last (4th) lookup
299 data_block + (code_point & SMALL_DATA_MASK)
300 }
301
302 /// Returns the position in the `data` array for the given code point,
303 /// where this code point is at or above the fast limit associated for the
304 /// `trie_type`. We will refer to that limit as "`fastMax`" here.
305 ///
306 /// A lookup of the value in the code point trie for a code point in the
307 /// code point space range [`fastMax`, `high_start`) will be a 4-step
308 /// lookup: 3 lookups in the `index` array and one lookup in the `data`
309 /// array. Lookups for code points in the range [`high_start`,
310 /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see
311 /// [`CodePointTrieHeader::high_start`].
312 fn small_index(&self, code_point: u32) -> u32 {
313 if code_point >= self.header.high_start {
314 self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET
315 } else {
316 self.internal_small_index(code_point) // helper fn
317 }
318 }
319
320 /// Returns the position in the `data` array for the given code point,
321 /// where this code point is below the fast limit associated for the
322 /// `trie type`. We will refer to that limit as "`fastMax`" here.
323 ///
324 /// A lookup of the value in the code point trie for a code point in the
325 /// code point space range [0, `fastMax`) will be a 2-step lookup: 1
326 /// lookup in the `index` array and one lookup in the `data` array. By
327 /// design, for trie type `T`, there is an element allocated in the `index`
328 /// array for each block of code points in [0, `fastMax`), which in
329 /// turn guarantees that those code points are represented and only need 1
330 /// lookup.
331 #[inline(always)] // `always` based on normalizer benchmarking
332 fn fast_index(&self, code_point: u32) -> u32 {
333 let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT;
334 let index_array_val: u16 =
335 if let Some(index_array_val) = self.index.get(index_array_pos as usize) {
336 index_array_val
337 } else {
338 return self.trie_error_val_index();
339 };
340 let fast_index_val: u32 = index_array_val as u32 + (code_point & FAST_TYPE_DATA_MASK);
341 fast_index_val
342 }
343
344 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`].
345 ///
346 /// # Examples
347 ///
348 /// ```
349 /// use icu::collections::codepointtrie::planes;
350 /// let trie = planes::get_planes_trie();
351 ///
352 /// assert_eq!(0, trie.get32(0x41)); // 'A' as u32
353 /// assert_eq!(0, trie.get32(0x13E0)); // 'Ꮰ' as u32
354 /// assert_eq!(1, trie.get32(0x10044)); // '𐁄' as u32
355 /// ```
356 #[inline(always)] // `always` based on normalizer benchmarking
357 pub fn get32(&self, code_point: u32) -> T {
358 // If we cannot read from the data array, then return the sentinel value
359 // self.error_value() for the instance type for T: TrieValue.
360 self.get32_ule(code_point)
361 .map(|t| T::from_unaligned(*t))
362 .unwrap_or(self.error_value)
363 }
364
365 /// Returns the value that is associated with `char` in this [`CodePointTrie`].
366 ///
367 /// # Examples
368 ///
369 /// ```
370 /// use icu::collections::codepointtrie::planes;
371 /// let trie = planes::get_planes_trie();
372 ///
373 /// assert_eq!(0, trie.get('A')); // 'A' as u32
374 /// assert_eq!(0, trie.get('Ꮰ')); // 'Ꮰ' as u32
375 /// assert_eq!(1, trie.get('𐁄')); // '𐁄' as u32
376 /// ```
377 #[inline(always)]
378 pub fn get(&self, c: char) -> T {
379 self.get32(u32::from(c))
380 }
381
382 /// Returns a reference to the ULE of the value that is associated with `code_point` in this [`CodePointTrie`].
383 ///
384 /// # Examples
385 ///
386 /// ```
387 /// use icu::collections::codepointtrie::planes;
388 /// let trie = planes::get_planes_trie();
389 ///
390 /// assert_eq!(Some(&0), trie.get32_ule(0x41)); // 'A' as u32
391 /// assert_eq!(Some(&0), trie.get32_ule(0x13E0)); // 'Ꮰ' as u32
392 /// assert_eq!(Some(&1), trie.get32_ule(0x10044)); // '𐁄' as u32
393 /// ```
394 #[inline(always)] // `always` based on normalizer benchmarking
395 pub fn get32_ule(&self, code_point: u32) -> Option<&T::ULE> {
396 // All code points up to the fast max limit are represented
397 // individually in the `index` array to hold their `data` array position, and
398 // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
399 // Code points above the "fast max" limit require 4 lookups.
400 let fast_max = match self.header.trie_type {
401 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
402 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
403 };
404 let data_pos: u32 = if code_point <= fast_max {
405 Self::fast_index(self, code_point)
406 } else if code_point <= CODE_POINT_MAX {
407 Self::small_index(self, code_point)
408 } else {
409 self.trie_error_val_index()
410 };
411 // Returns the trie value (or trie's error value).
412 self.data.as_ule_slice().get(data_pos as usize)
413 }
414
415 /// Converts the [`CodePointTrie`] into one that returns another type of the same size.
416 ///
417 /// Borrowed data remains borrowed, and owned data remains owned.
418 ///
419 /// If the old and new types are not the same size, use
420 /// [`CodePointTrie::try_alloc_map_value`].
421 ///
422 /// # Panics
423 ///
424 /// Panics if `T` and `P` are different sizes.
425 ///
426 /// More specifically, panics if [`ZeroVec::try_into_converted()`] panics when converting
427 /// `ZeroVec<T>` into `ZeroVec<P>`, which happens if `T::ULE` and `P::ULE` differ in size.
428 ///
429 /// # Examples
430 ///
431 /// ```no_run
432 /// use icu::collections::codepointtrie::planes;
433 /// use icu::collections::codepointtrie::CodePointTrie;
434 ///
435 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
436 /// let planes_trie_i8: CodePointTrie<i8> =
437 /// planes_trie_u8.try_into_converted().expect("infallible");
438 ///
439 /// assert_eq!(planes_trie_i8.get32(0x30000), 3);
440 /// ```
441 pub fn try_into_converted<P>(self) -> Result<CodePointTrie<'trie, P>, ZeroVecError>
442 where
443 P: TrieValue,
444 {
445 let converted_data = self.data.try_into_converted()?;
446 let error_ule = self.error_value.to_unaligned();
447 let slice = &[error_ule];
448 let error_vec = ZeroVec::<T>::new_borrowed(slice);
449 let error_converted = error_vec.try_into_converted::<P>()?;
450 #[allow(clippy::expect_used)] // we know this cannot fail
451 Ok(CodePointTrie {
452 header: self.header,
453 index: self.index,
454 data: converted_data,
455 error_value: error_converted
456 .get(0)
457 .expect("vector known to have one element"),
458 })
459 }
460
461 /// Maps the [`CodePointTrie`] into one that returns a different type.
462 ///
463 /// This function returns owned data.
464 ///
465 /// If the old and new types are the same size, use the more efficient
466 /// [`CodePointTrie::try_into_converted`].
467 ///
468 /// # Examples
469 ///
470 /// ```
471 /// use icu::collections::codepointtrie::planes;
472 /// use icu::collections::codepointtrie::CodePointTrie;
473 ///
474 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
475 /// let planes_trie_u16: CodePointTrie<u16> = planes_trie_u8
476 /// .try_alloc_map_value(TryFrom::try_from)
477 /// .expect("infallible");
478 ///
479 /// assert_eq!(planes_trie_u16.get32(0x30000), 3);
480 /// ```
481 pub fn try_alloc_map_value<P, E>(
482 &self,
483 mut f: impl FnMut(T) -> Result<P, E>,
484 ) -> Result<CodePointTrie<'trie, P>, E>
485 where
486 P: TrieValue,
487 {
488 let error_converted = f(self.error_value)?;
489 let converted_data = self.data.iter().map(f).collect::<Result<ZeroVec<P>, E>>()?;
490 Ok(CodePointTrie {
491 header: self.header,
492 index: self.index.clone(),
493 data: converted_data,
494 error_value: error_converted,
495 })
496 }
497
498 /// Returns a [`CodePointMapRange`] struct which represents a range of code
499 /// points associated with the same trie value. The returned range will be
500 /// the longest stretch of consecutive code points starting at `start` that
501 /// share this value.
502 ///
503 /// This method is designed to use the internal details of
504 /// the structure of [`CodePointTrie`] to be optimally efficient. This will
505 /// outperform a naive approach that just uses [`CodePointTrie::get()`].
506 ///
507 /// This method provides lower-level functionality that can be used in the
508 /// implementation of other methods that are more convenient to the user.
509 /// To obtain an optimal partition of the code point space for
510 /// this trie resulting in the fewest number of ranges, see
511 /// [`CodePointTrie::iter_ranges()`].
512 ///
513 /// # Examples
514 ///
515 /// ```
516 /// use icu::collections::codepointtrie::planes;
517 ///
518 /// let trie = planes::get_planes_trie();
519 ///
520 /// const CODE_POINT_MAX: u32 = 0x10ffff;
521 /// let start = 0x1_0000;
522 /// let exp_end = 0x1_ffff;
523 ///
524 /// let start_val = trie.get32(start);
525 /// assert_eq!(trie.get32(exp_end), start_val);
526 /// assert_ne!(trie.get32(exp_end + 1), start_val);
527 ///
528 /// use icu::collections::codepointtrie::CodePointMapRange;
529 ///
530 /// let cpm_range: CodePointMapRange<u8> = trie.get_range(start).unwrap();
531 ///
532 /// assert_eq!(cpm_range.range.start(), &start);
533 /// assert_eq!(cpm_range.range.end(), &exp_end);
534 /// assert_eq!(cpm_range.value, start_val);
535 ///
536 /// // `start` can be any code point, whether or not it lies on the boundary
537 /// // of a maximally large range that still contains `start`
538 ///
539 /// let submaximal_1_start = start + 0x1234;
540 /// let submaximal_1 = trie.get_range(submaximal_1_start).unwrap();
541 /// assert_eq!(submaximal_1.range.start(), &0x1_1234);
542 /// assert_eq!(submaximal_1.range.end(), &0x1_ffff);
543 /// assert_eq!(submaximal_1.value, start_val);
544 ///
545 /// let submaximal_2_start = start + 0xffff;
546 /// let submaximal_2 = trie.get_range(submaximal_2_start).unwrap();
547 /// assert_eq!(submaximal_2.range.start(), &0x1_ffff);
548 /// assert_eq!(submaximal_2.range.end(), &0x1_ffff);
549 /// assert_eq!(submaximal_2.value, start_val);
550 /// ```
551 pub fn get_range(&self, start: u32) -> Option<CodePointMapRange<T>> {
552 // Exit early if the start code point is out of range, or if it is
553 // in the last range of code points in high_start..=CODE_POINT_MAX
554 // (start- and end-inclusive) that all share the same trie value.
555 if CODE_POINT_MAX < start {
556 return None;
557 }
558 if start >= self.header.high_start {
559 let di: usize = self.data.len() - (HIGH_VALUE_NEG_DATA_OFFSET as usize);
560 let value: T = self.data.get(di)?;
561 return Some(CodePointMapRange {
562 range: RangeInclusive::new(start, CODE_POINT_MAX),
563 value,
564 });
565 }
566
567 let null_value: T = T::try_from_u32(self.header.null_value).ok()?;
568
569 let mut prev_i3_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
570 let mut prev_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
571 let mut c: u32 = start;
572 let mut trie_value: T = self.error_value();
573 let mut value: T = self.error_value();
574 let mut have_value: bool = false;
575
576 loop {
577 let i3_block: u32;
578 let mut i3: u32;
579 let i3_block_length: u32;
580 let data_block_length: u32;
581
582 // Initialize values before beginning the iteration in the subsequent
583 // `loop` block. In particular, use the "i3*" local variables
584 // (representing the `index` array position's offset + increment
585 // for a 3rd-level trie lookup) to help initialize the data block
586 // variable `block` in the loop for the `data` array.
587 //
588 // When a lookup code point is <= the trie's *_FAST_INDEXING_MAX that
589 // corresponds to its `trie_type`, the lookup only takes 2 steps
590 // (once into the `index`, once into the `data` array); otherwise,
591 // takes 4 steps (3 iterative lookups into the `index`, once more
592 // into the `data` array). So for convenience's sake, when we have the
593 // 2-stage lookup, reuse the "i3*" variable names for the first lookup.
594 if c <= 0xffff
595 && (self.header.trie_type == TrieType::Fast || c <= SMALL_TYPE_FAST_INDEXING_MAX)
596 {
597 i3_block = 0;
598 i3 = c >> FAST_TYPE_SHIFT;
599 i3_block_length = if self.header.trie_type == TrieType::Fast {
600 BMP_INDEX_LENGTH
601 } else {
602 SMALL_INDEX_LENGTH
603 };
604 data_block_length = FAST_TYPE_DATA_BLOCK_LENGTH;
605 } else {
606 // Use the multi-stage index.
607 let mut i1: u32 = c >> SHIFT_1;
608 if self.header.trie_type == TrieType::Fast {
609 debug_assert!(0xffff < c && c < self.header.high_start);
610 i1 = i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH;
611 } else {
612 debug_assert!(
613 c < self.header.high_start && self.header.high_start > SMALL_LIMIT
614 );
615 i1 += SMALL_INDEX_LENGTH;
616 }
617 let i2: u16 = self.index.get(i1 as usize)?;
618 let i3_block_idx: u32 = (i2 as u32) + ((c >> SHIFT_2) & INDEX_2_MASK);
619 i3_block = if let Some(i3b) = self.index.get(i3_block_idx as usize) {
620 i3b as u32
621 } else {
622 return None;
623 };
624 if i3_block == prev_i3_block && (c - start) >= CP_PER_INDEX_2_ENTRY {
625 // The index-3 block is the same as the previous one, and filled with value.
626 debug_assert!((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0);
627 c += CP_PER_INDEX_2_ENTRY;
628
629 if c >= self.header.high_start {
630 break;
631 } else {
632 continue;
633 }
634 }
635 prev_i3_block = i3_block;
636 if i3_block == self.header.index3_null_offset as u32 {
637 // This is the index-3 null block.
638 // All of the `data` array blocks pointed to by the values
639 // in this block of the `index` 3rd-stage subarray will
640 // contain this trie's null_value. So if we are in the middle
641 // of a range, end it and return early, otherwise start a new
642 // range of null values.
643 if have_value {
644 if null_value != value {
645 return Some(CodePointMapRange {
646 range: RangeInclusive::new(start, c - 1),
647 value,
648 });
649 }
650 } else {
651 trie_value = T::try_from_u32(self.header.null_value).ok()?;
652 value = null_value;
653 have_value = true;
654 }
655 prev_block = self.header.data_null_offset;
656 c = (c + CP_PER_INDEX_2_ENTRY) & !(CP_PER_INDEX_2_ENTRY - 1);
657
658 if c >= self.header.high_start {
659 break;
660 } else {
661 continue;
662 }
663 }
664 i3 = (c >> SHIFT_3) & INDEX_3_MASK;
665 i3_block_length = INDEX_3_BLOCK_LENGTH;
666 data_block_length = SMALL_DATA_BLOCK_LENGTH;
667 }
668
669 // Enumerate data blocks for one index-3 block.
670 loop {
671 let mut block: u32;
672 if (i3_block & 0x8000) == 0 {
673 block = if let Some(b) = self.index.get((i3_block + i3) as usize) {
674 b as u32
675 } else {
676 return None;
677 };
678 } else {
679 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
680 let mut group: u32 = (i3_block & 0x7fff) + (i3 & !7) + (i3 >> 3);
681 let gi: u32 = i3 & 7;
682 let gi_val: u32 = if let Some(giv) = self.index.get(group as usize) {
683 giv.into()
684 } else {
685 return None;
686 };
687 block = (gi_val << (2 + (2 * gi))) & 0x30000;
688 group += 1;
689 let ggi_val: u32 = if let Some(ggiv) = self.index.get((group + gi) as usize) {
690 ggiv as u32
691 } else {
692 return None;
693 };
694 block |= ggi_val;
695 }
696
697 // If our previous and current return values of the 3rd-stage `index`
698 // lookup yield the same `data` block offset, and if we already know that
699 // the entire `data` block / subarray starting at that offset stores
700 // `value` and nothing else, then we can extend our range by the length
701 // of a data block and continue.
702 // Otherwise, we have to iterate over the values stored in the
703 // new data block to see if they differ from `value`.
704 if block == prev_block && (c - start) >= data_block_length {
705 // The block is the same as the previous one, and filled with value.
706 debug_assert!((c & (data_block_length - 1)) == 0);
707 c += data_block_length;
708 } else {
709 let data_mask: u32 = data_block_length - 1;
710 prev_block = block;
711 if block == self.header.data_null_offset {
712 // This is the data null block.
713 // If we are in the middle of a range, end it and
714 // return early, otherwise start a new range of null
715 // values.
716 if have_value {
717 if null_value != value {
718 return Some(CodePointMapRange {
719 range: RangeInclusive::new(start, c - 1),
720 value,
721 });
722 }
723 } else {
724 trie_value = T::try_from_u32(self.header.null_value).ok()?;
725 value = null_value;
726 have_value = true;
727 }
728 c = (c + data_block_length) & !data_mask;
729 } else {
730 let mut di: u32 = block + (c & data_mask);
731 let mut trie_value_2: T = self.data.get(di as usize)?;
732 if have_value {
733 if trie_value_2 != trie_value {
734 if maybe_filter_value(
735 trie_value_2,
736 T::try_from_u32(self.header.null_value).ok()?,
737 null_value,
738 ) != value
739 {
740 return Some(CodePointMapRange {
741 range: RangeInclusive::new(start, c - 1),
742 value,
743 });
744 }
745 // `trie_value` stores the previous value that was retrieved
746 // from the trie.
747 // `value` stores the value associated for the range (return
748 // value) that we are currently building, which is computed
749 // as a transformation by applying maybe_filter_value()
750 // to the trie value.
751 // The current trie value `trie_value_2` within this data block
752 // differs here from the previous value in `trie_value`.
753 // But both map to `value` after applying `maybe_filter_value`.
754 // It is not clear whether the previous or the current trie value
755 // (or neither) is more likely to match potential subsequent trie
756 // values that would extend the range by mapping to `value`.
757 // On the assumption of locality -- often times consecutive
758 // characters map to the same trie values -- remembering the new
759 // one might make it faster to extend this range further
760 // (by increasing the chance that the next `trie_value_2 !=
761 // trie_value` test will be false).
762 trie_value = trie_value_2; // may or may not help
763 }
764 } else {
765 trie_value = trie_value_2;
766 value = maybe_filter_value(
767 trie_value_2,
768 T::try_from_u32(self.header.null_value).ok()?,
769 null_value,
770 );
771 have_value = true;
772 }
773
774 c += 1;
775 while (c & data_mask) != 0 {
776 di += 1;
777 trie_value_2 = self.data.get(di as usize)?;
778 if trie_value_2 != trie_value {
779 if maybe_filter_value(
780 trie_value_2,
781 T::try_from_u32(self.header.null_value).ok()?,
782 null_value,
783 ) != value
784 {
785 return Some(CodePointMapRange {
786 range: RangeInclusive::new(start, c - 1),
787 value,
788 });
789 }
790 // `trie_value` stores the previous value that was retrieved
791 // from the trie.
792 // `value` stores the value associated for the range (return
793 // value) that we are currently building, which is computed
794 // as a transformation by applying maybe_filter_value()
795 // to the trie value.
796 // The current trie value `trie_value_2` within this data block
797 // differs here from the previous value in `trie_value`.
798 // But both map to `value` after applying `maybe_filter_value`.
799 // It is not clear whether the previous or the current trie value
800 // (or neither) is more likely to match potential subsequent trie
801 // values that would extend the range by mapping to `value`.
802 // On the assumption of locality -- often times consecutive
803 // characters map to the same trie values -- remembering the new
804 // one might make it faster to extend this range further
805 // (by increasing the chance that the next `trie_value_2 !=
806 // trie_value` test will be false).
807 trie_value = trie_value_2; // may or may not help
808 }
809
810 c += 1;
811 }
812 }
813 }
814
815 i3 += 1;
816 if i3 >= i3_block_length {
817 break;
818 }
819 }
820
821 if c >= self.header.high_start {
822 break;
823 }
824 }
825
826 debug_assert!(have_value);
827
828 // Now that c >= high_start, compare `value` to `high_value` to see
829 // if we can merge our current range with the high_value range
830 // high_start..=CODE_POINT_MAX (start- and end-inclusive), otherwise
831 // stop at high_start - 1.
832 let di: u32 = self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET;
833 let high_value: T = self.data.get(di as usize)?;
834 if maybe_filter_value(
835 high_value,
836 T::try_from_u32(self.header.null_value).ok()?,
837 null_value,
838 ) != value
839 {
840 c -= 1;
841 } else {
842 c = CODE_POINT_MAX;
843 }
844 Some(CodePointMapRange {
845 range: RangeInclusive::new(start, c),
846 value,
847 })
848 }
849
850 /// Yields an [`Iterator`] returning ranges of consecutive code points that
851 /// share the same value in the [`CodePointTrie`], as given by
852 /// [`CodePointTrie::get_range()`].
853 ///
854 /// # Examples
855 ///
856 /// ```
857 /// use core::ops::RangeInclusive;
858 /// use icu::collections::codepointtrie::planes;
859 /// use icu::collections::codepointtrie::CodePointMapRange;
860 ///
861 /// let planes_trie = planes::get_planes_trie();
862 ///
863 /// let mut ranges = planes_trie.iter_ranges();
864 ///
865 /// for plane in 0..=16 {
866 /// let exp_start = plane * 0x1_0000;
867 /// let exp_end = exp_start + 0xffff;
868 /// assert_eq!(
869 /// ranges.next(),
870 /// Some(CodePointMapRange {
871 /// range: RangeInclusive::new(exp_start, exp_end),
872 /// value: plane as u8
873 /// })
874 /// );
875 /// }
876 ///
877 /// // Hitting the end of the iterator returns `None`, as will subsequent
878 /// // calls to .next().
879 /// assert_eq!(ranges.next(), None);
880 /// assert_eq!(ranges.next(), None);
881 /// ```
882 pub fn iter_ranges(&self) -> CodePointMapRangeIterator<T> {
883 let init_range = Some(CodePointMapRange {
884 range: RangeInclusive::new(u32::MAX, u32::MAX),
885 value: self.error_value(),
886 });
887 CodePointMapRangeIterator::<T> {
888 cpt: self,
889 cpm_range: init_range,
890 }
891 }
892
893 /// Yields an [`Iterator`] returning the ranges of the code points whose values
894 /// match `value` in the [`CodePointTrie`].
895 ///
896 /// # Examples
897 ///
898 /// ```
899 /// use icu::collections::codepointtrie::planes;
900 ///
901 /// let trie = planes::get_planes_trie();
902 ///
903 /// let plane_val = 2;
904 /// let mut sip_range_iter = trie.get_ranges_for_value(plane_val as u8);
905 ///
906 /// let start = plane_val * 0x1_0000;
907 /// let end = start + 0xffff;
908 ///
909 /// let sip_range = sip_range_iter.next()
910 /// .expect("Plane 2 (SIP) should exist in planes data");
911 /// assert_eq!(start..=end, sip_range);
912 ///
913 /// assert!(sip_range_iter.next().is_none());
914 pub fn get_ranges_for_value(&self, value: T) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
915 self.iter_ranges()
916 .filter(move |cpm_range| cpm_range.value == value)
917 .map(|cpm_range| cpm_range.range)
918 }
919
920 /// Yields an [`Iterator`] returning the ranges of the code points after passing
921 /// the value through a mapping function.
922 ///
923 /// This is preferable to calling `.get_ranges().map()` since it will coalesce
924 /// adjacent ranges into one.
925 ///
926 /// # Examples
927 ///
928 /// ```
929 /// use icu::collections::codepointtrie::planes;
930 ///
931 /// let trie = planes::get_planes_trie();
932 ///
933 /// let plane_val = 2;
934 /// let mut sip_range_iter = trie.iter_ranges_mapped(|value| value != plane_val as u8).filter(|range| range.value);
935 ///
936 /// let end = plane_val * 0x1_0000 - 1;
937 ///
938 /// let sip_range = sip_range_iter.next()
939 /// .expect("Complemented planes data should have at least one entry");
940 /// assert_eq!(0..=end, sip_range.range);
941 pub fn iter_ranges_mapped<'a, U: Eq + 'a>(
942 &'a self,
943 mut map: impl FnMut(T) -> U + Copy + 'a,
944 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
945 crate::iterator_utils::RangeListIteratorCoalescer::new(self.iter_ranges().map(
946 move |range| CodePointMapRange {
947 range: range.range,
948 value: map(range.value),
949 },
950 ))
951 }
952
953 /// Returns a [`CodePointInversionList`] for the code points that have the given
954 /// [`TrieValue`] in the trie.
955 ///
956 /// # Examples
957 ///
958 /// ```
959 /// use icu::collections::codepointtrie::planes;
960 ///
961 /// let trie = planes::get_planes_trie();
962 ///
963 /// let plane_val = 2;
964 /// let sip = trie.get_set_for_value(plane_val as u8);
965 ///
966 /// let start = plane_val * 0x1_0000;
967 /// let end = start + 0xffff;
968 ///
969 /// assert!(!sip.contains32(start - 1));
970 /// assert!(sip.contains32(start));
971 /// assert!(sip.contains32(end));
972 /// assert!(!sip.contains32(end + 1));
973 /// ```
974 pub fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
975 let value_ranges = self.get_ranges_for_value(value);
976 CodePointInversionList::from_iter(value_ranges)
977 }
978
979 /// Returns the value used as an error value for this trie
980 #[inline]
981 pub fn error_value(&self) -> T {
982 self.error_value
983 }
984}
985
986#[cfg(feature = "databake")]
987impl<'trie, T: TrieValue + databake::Bake> databake::Bake for CodePointTrie<'trie, T> {
988 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
989 let header = self.header.bake(env);
990 let index = self.index.bake(env);
991 let data = self.data.bake(env);
992 let error_value = self.error_value.bake(env);
993 databake::quote! { icu_collections::codepointtrie::CodePointTrie::from_parts(#header, #index, #data, #error_value) }
994 }
995}
996
997impl<'trie, T: TrieValue + Into<u32>> CodePointTrie<'trie, T> {
998 /// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
999 /// as a `u32`.
1000 ///
1001 /// # Examples
1002 ///
1003 /// ```
1004 /// use icu::collections::codepointtrie::planes;
1005 /// let trie = planes::get_planes_trie();
1006 ///
1007 /// let cp = '𑖎' as u32;
1008 /// assert_eq!(cp, 0x1158E);
1009 ///
1010 /// let plane_num: u8 = trie.get32(cp);
1011 /// assert_eq!(trie.get32_u32(cp), plane_num as u32);
1012 /// ```
1013 // Note: This API method maintains consistency with the corresponding
1014 // original ICU APIs.
1015 pub fn get32_u32(&self, code_point: u32) -> u32 {
1016 self.get32(code_point).into()
1017 }
1018}
1019
1020impl<'trie, T: TrieValue> Clone for CodePointTrie<'trie, T>
1021where
1022 <T as zerovec::ule::AsULE>::ULE: Clone,
1023{
1024 fn clone(&self) -> Self {
1025 CodePointTrie {
1026 header: self.header,
1027 index: self.index.clone(),
1028 data: self.data.clone(),
1029 error_value: self.error_value,
1030 }
1031 }
1032}
1033
1034/// Represents a range of consecutive code points sharing the same value in a
1035/// code point map. The start and end of the interval is represented as a
1036/// `RangeInclusive<u32>`, and the value is represented as `T`.
1037#[derive(PartialEq, Eq, Debug, Clone)]
1038pub struct CodePointMapRange<T> {
1039 /// Range of code points from start to end (inclusive).
1040 pub range: RangeInclusive<u32>,
1041 /// Trie value associated with this range.
1042 pub value: T,
1043}
1044
1045/// A custom [`Iterator`] type specifically for a code point trie that returns
1046/// [`CodePointMapRange`]s.
1047pub struct CodePointMapRangeIterator<'a, T: TrieValue> {
1048 cpt: &'a CodePointTrie<'a, T>,
1049 // Initialize `range` to Some(CodePointMapRange{ start: u32::MAX, end: u32::MAX, value: 0}).
1050 // When `range` is Some(...) and has a start value different from u32::MAX, then we have
1051 // returned at least one code point range due to a call to `next()`.
1052 // When `range` == `None`, it means that we have hit the end of iteration. It would occur
1053 // after a call to `next()` returns a None <=> we attempted to call `get_range()`
1054 // with a start code point that is > CODE_POINT_MAX.
1055 cpm_range: Option<CodePointMapRange<T>>,
1056}
1057
1058impl<'a, T: TrieValue> Iterator for CodePointMapRangeIterator<'a, T> {
1059 type Item = CodePointMapRange<T>;
1060
1061 fn next(&mut self) -> Option<Self::Item> {
1062 self.cpm_range = match &self.cpm_range {
1063 Some(cpmr) => {
1064 if *cpmr.range.start() == u32::MAX {
1065 self.cpt.get_range(0)
1066 } else {
1067 self.cpt.get_range(cpmr.range.end() + 1)
1068 }
1069 }
1070 None => None,
1071 };
1072 // Note: Clone is cheap. We can't Copy because RangeInclusive does not impl Copy.
1073 self.cpm_range.clone()
1074 }
1075}
1076
1077#[cfg(test)]
1078mod tests {
1079 use super::*;
1080 use crate::codepointtrie::planes;
1081 use alloc::vec::Vec;
1082
1083 #[test]
1084 #[cfg(feature = "serde")]
1085 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1086 let trie = crate::codepointtrie::planes::get_planes_trie();
1087 let trie_serialized: Vec<u8> = postcard::to_allocvec(&trie).unwrap();
1088
1089 // Assert an expected (golden data) version of the serialized trie.
1090 const EXP_TRIE_SERIALIZED: &[u8] = &[
1091 128, 128, 64, 128, 2, 2, 0, 0, 1, 160, 18, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1092 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1093 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1094 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1095 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136,
1096 2, 144, 2, 144, 2, 144, 2, 176, 2, 176, 2, 176, 2, 176, 2, 208, 2, 208, 2, 208, 2, 208,
1097 2, 240, 2, 240, 2, 240, 2, 240, 2, 16, 3, 16, 3, 16, 3, 16, 3, 48, 3, 48, 3, 48, 3, 48,
1098 3, 80, 3, 80, 3, 80, 3, 80, 3, 112, 3, 112, 3, 112, 3, 112, 3, 144, 3, 144, 3, 144, 3,
1099 144, 3, 176, 3, 176, 3, 176, 3, 176, 3, 208, 3, 208, 3, 208, 3, 208, 3, 240, 3, 240, 3,
1100 240, 3, 240, 3, 16, 4, 16, 4, 16, 4, 16, 4, 48, 4, 48, 4, 48, 4, 48, 4, 80, 4, 80, 4,
1101 80, 4, 80, 4, 112, 4, 112, 4, 112, 4, 112, 4, 0, 0, 16, 0, 32, 0, 48, 0, 64, 0, 80, 0,
1102 96, 0, 112, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32,
1103 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48,
1104 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 128, 0, 128, 0, 128, 0, 128,
1105 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1106 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1107 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1108 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1109 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1110 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1111 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1112 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1113 0, 160, 0, 160, 0, 160, 0, 160, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1114 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1115 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1116 0, 176, 0, 176, 0, 176, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1117 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1118 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1119 0, 192, 0, 192, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1120 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1121 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1122 0, 208, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1123 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1124 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1125 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1126 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1127 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 0,
1128 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
1129 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1130 1, 0, 1, 0, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1,
1131 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16,
1132 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 32, 1, 32, 1, 32, 1,
1133 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32,
1134 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1,
1135 32, 1, 32, 1, 32, 1, 32, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48,
1136 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1,
1137 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 64, 1, 64,
1138 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1,
1139 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64,
1140 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1141 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80,
1142 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1143 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96,
1144 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1,
1145 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 128, 0, 136, 0, 136, 0, 136, 0, 136,
1146 0, 136, 0, 136, 0, 136, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
1147 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
1148 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1149 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1150 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1151 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1152 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1153 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1154 200, 0, 200, 0, 200, 0, 200, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1155 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1156 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1157 232, 0, 232, 0, 232, 0, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8,
1158 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1,
1159 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1160 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1,
1161 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1162 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1,
1163 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72,
1164 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1165 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1166 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1167 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1168 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1169 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1170 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1171 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1172 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1173 168, 1, 168, 1, 168, 1, 168, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1174 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1175 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1176 200, 1, 200, 1, 200, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1177 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1178 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1179 232, 1, 232, 1, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2,
1180 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8,
1181 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40,
1182 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2,
1183 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 72,
1184 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2,
1185 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72,
1186 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1187 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1188 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1189 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 244, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1194 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1195 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
1196 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
1197 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
1198 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10,
1199 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
1200 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
1201 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
1202 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1203 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 0,
1204 ];
1205 assert_eq!(trie_serialized, EXP_TRIE_SERIALIZED);
1206
1207 let trie_deserialized = postcard::from_bytes::<CodePointTrie<u8>>(&trie_serialized)?;
1208
1209 assert_eq!(&trie.index, &trie_deserialized.index);
1210 assert_eq!(&trie.data, &trie_deserialized.data);
1211
1212 assert!(!trie_deserialized.index.is_owned());
1213 assert!(!trie_deserialized.data.is_owned());
1214
1215 Ok(())
1216 }
1217
1218 #[test]
1219 fn test_get_range() {
1220 let planes_trie = planes::get_planes_trie();
1221
1222 let first_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x0);
1223 assert_eq!(
1224 first_range,
1225 Some(CodePointMapRange {
1226 range: RangeInclusive::new(0x0, 0xffff),
1227 value: 0
1228 })
1229 );
1230
1231 let second_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x1_0000);
1232 assert_eq!(
1233 second_range,
1234 Some(CodePointMapRange {
1235 range: RangeInclusive::new(0x10000, 0x1ffff),
1236 value: 1
1237 })
1238 );
1239
1240 let penultimate_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0xf_0000);
1241 assert_eq!(
1242 penultimate_range,
1243 Some(CodePointMapRange {
1244 range: RangeInclusive::new(0xf_0000, 0xf_ffff),
1245 value: 15
1246 })
1247 );
1248
1249 let last_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x10_0000);
1250 assert_eq!(
1251 last_range,
1252 Some(CodePointMapRange {
1253 range: RangeInclusive::new(0x10_0000, 0x10_ffff),
1254 value: 16
1255 })
1256 );
1257 }
1258
1259 #[test]
1260 fn databake() {
1261 databake::test_bake!(
1262 CodePointTrie<'static, u32>,
1263 const: crate::codepointtrie::CodePointTrie::from_parts(
1264 crate::codepointtrie::CodePointTrieHeader {
1265 high_start: 1u32,
1266 shifted12_high_start: 2u16,
1267 index3_null_offset: 3u16,
1268 data_null_offset: 4u32,
1269 null_value: 5u32,
1270 trie_type: crate::codepointtrie::TrieType::Small,
1271 },
1272 zerovec::ZeroVec::new(),
1273 zerovec::ZeroVec::new(),
1274 0u32,
1275 ),
1276 icu_collections,
1277 [zerovec],
1278 );
1279 }
1280}