icu_collections/char16trie/trie.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use zerofrom::ZeroFrom;
6use zerovec::{ZeroSlice, ZeroVec};
7
8// Match-node lead unit values, after masking off intermediate-value bits:
9
10// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
11// the length is one more than the next byte.
12
13// For a branch sub-node with at most this many entries, we drop down
14// to a linear search.
15const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5;
16
17// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
18const MIN_LINEAR_MATCH: u16 = 0x30;
19const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10;
20
21// Match-node lead unit bits 14..6 for the optional intermediate value.
22// If these bits are 0, then there is no intermediate value.
23// Otherwise, see the *NodeValue* constants below.
24const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40
25const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f
26
27// A final-value node has bit 15 set.
28const VALUE_IS_FINAL: u16 = 0x8000;
29
30// Compact value: After testing bit 0, shift right by 15 and then use the following thresholds.
31const MAX_ONE_UNIT_VALUE: u16 = 0x3fff;
32
33const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000
34
35const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff;
36
37const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040
38
39const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0;
40
41const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff;
42
43// Compact delta integers.
44const MAX_ONE_UNIT_DELTA: u16 = 0xfbff;
45const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00
46const THREE_UNIT_DELTA_LEAD: u16 = 0xffff;
47
48fn skip_value(pos: usize, lead: u16) -> usize {
49 if lead < MIN_TWO_UNIT_VALUE_LEAD {
50 pos
51 } else if lead < THREE_UNIT_VALUE_LEAD {
52 pos + 1
53 } else {
54 pos + 2
55 }
56}
57
58fn skip_node_value(pos: usize, lead: u16) -> usize {
59 if lead < MIN_TWO_UNIT_NODE_VALUE_LEAD {
60 pos
61 } else if lead < THREE_UNIT_NODE_VALUE_LEAD {
62 pos + 1
63 } else {
64 pos + 2
65 }
66}
67
68/// This struct represents a de-serialized `Char16Trie` that was exported from
69/// ICU binary data.
70///
71/// Light-weight, non-const reader class for a `CharsTrie`. Traverses a
72/// char-serialized data structure with minimal state, for mapping 16-bit-unit
73/// sequences to non-negative integer values.
74///
75/// For more information:
76/// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
77/// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
78#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
79#[cfg_attr(feature = "databake", derive(databake::Bake), databake(path = icu_collections::char16trie))]
80#[derive(Clone, Debug, PartialEq, Eq, ZeroFrom)]
81pub struct Char16Trie<'data> {
82 /// An array of u16 containing the trie data.
83 #[cfg_attr(feature = "serde", serde(borrow))]
84 #[doc(hidden)] // #2417
85 pub data: ZeroVec<'data, u16>,
86}
87
88impl<'data> Char16Trie<'data> {
89 /// Returns a new [`Char16Trie`] with ownership of the provided data.
90 pub fn new(data: ZeroVec<'data, u16>) -> Self {
91 Self { data }
92 }
93
94 /// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data
95 pub fn iter(&self) -> Char16TrieIterator {
96 Char16TrieIterator::new(&self.data)
97 }
98}
99
100/// This struct represents an iterator over a [`Char16Trie`].
101#[derive(Clone)]
102pub struct Char16TrieIterator<'a> {
103 /// A reference to the Char16Trie data to iterate over.
104 trie: &'a ZeroSlice<u16>,
105 /// Index of next trie unit to read, or `None` if there are no more matches.
106 pos: Option<usize>,
107 /// Remaining length of a linear-match node, minus 1, or `None` if not in
108 /// such a node.
109 remaining_match_length: Option<usize>,
110}
111
112/// An enum representing the return value from a lookup in [`Char16Trie`].
113#[derive(Clone, Copy, Debug, PartialEq)]
114pub enum TrieResult {
115 /// The input unit(s) did not continue a matching string.
116 /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()`
117 /// will also return `TrieResult::NoMatch`.
118 NoMatch,
119 /// The input unit(s) matched a string but there is no value for the string
120 /// so far. (It is a prefix of a longer string.)
121 NoValue,
122 /// The input unit(s) continued a matching string and there is a value for
123 /// the string so far. No further input byte/unit can continue a matching
124 /// string.
125 FinalValue(i32),
126 /// The input unit(s) continued a matching string and there is a value for
127 /// the string so far. Another input byte/unit can continue a matching
128 /// string.
129 Intermediate(i32),
130}
131
132// Get the lead surrogate (0xd800..0xdbff) for a
133// supplementary code point (0x10000..0x10ffff).
134// @param supplementary 32-bit code point (U+10000..U+10ffff)
135// @return lead surrogate (U+d800..U+dbff) for supplementary
136fn u16_lead(supplementary: i32) -> u16 {
137 (((supplementary) >> 10) + 0xd7c0) as u16
138}
139
140// Get the trail surrogate (0xdc00..0xdfff) for a
141// supplementary code point (0x10000..0x10ffff).
142// @param supplementary 32-bit code point (U+10000..U+10ffff)
143// @return trail surrogate (U+dc00..U+dfff) for supplementary
144fn u16_tail(supplementary: i32) -> u16 {
145 (((supplementary) & 0x3ff) | 0xdc00) as u16
146}
147
148/// A macro that takes an `Option` argument and either unwraps it if it has a value or
149/// causes the function to return `TrieResult::NoMatch` if there is no value.
150/// This could perhaps be done with `std::ops::Try` once stabilized.
151macro_rules! trie_unwrap {
152 ($option:expr) => {
153 match $option {
154 Some(x) => x,
155 None => {
156 // Unexpected
157 debug_assert!(false);
158 return TrieResult::NoMatch;
159 }
160 }
161 };
162}
163
164impl<'a> Char16TrieIterator<'a> {
165 /// Returns a new [`Char16TrieIterator`] backed by borrowed data for the `trie` array
166 pub fn new(trie: &'a ZeroSlice<u16>) -> Self {
167 Self {
168 trie,
169 pos: Some(0),
170 remaining_match_length: None,
171 }
172 }
173
174 /// Traverses the trie from the current state for this input char.
175 ///
176 /// # Examples
177 ///
178 /// ```
179 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
180 /// use zerovec::ZeroVec;
181 ///
182 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
183 /// let trie_data = [48, 97, 176, 98, 32868];
184 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
185 ///
186 /// let mut iter = trie.iter();
187 /// let res = iter.next('a');
188 /// assert_eq!(res, TrieResult::Intermediate(1));
189 /// let res = iter.next('b');
190 /// assert_eq!(res, TrieResult::FinalValue(100));
191 /// let res = iter.next('c');
192 /// assert_eq!(res, TrieResult::NoMatch);
193 /// ```
194 pub fn next(&mut self, c: char) -> TrieResult {
195 if (c as u32) <= 0xffff {
196 self.next16(c as u16)
197 } else {
198 match self.next16(u16_lead(c as i32)) {
199 TrieResult::NoValue | TrieResult::Intermediate(_) => {
200 self.next16(u16_tail(c as i32))
201 }
202 _ => TrieResult::NoMatch,
203 }
204 }
205 }
206
207 /// Traverses the trie from the current state for this input char.
208 ///
209 /// # Examples
210 ///
211 /// ```
212 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
213 /// use zerovec::ZeroVec;
214 ///
215 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
216 /// let trie_data = [48, 97, 176, 98, 32868];
217 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
218 ///
219 /// let mut iter = trie.iter();
220 /// let res = iter.next('a');
221 /// assert_eq!(res, TrieResult::Intermediate(1));
222 /// let res = iter.next('b');
223 /// assert_eq!(res, TrieResult::FinalValue(100));
224 /// let res = iter.next('c');
225 /// assert_eq!(res, TrieResult::NoMatch);
226 /// ```
227 pub fn next32(&mut self, c: u32) -> TrieResult {
228 if c <= 0xffff {
229 self.next16(c as u16)
230 } else {
231 match self.next16(u16_lead(c as i32)) {
232 TrieResult::NoValue | TrieResult::Intermediate(_) => {
233 self.next16(u16_tail(c as i32))
234 }
235 _ => TrieResult::NoMatch,
236 }
237 }
238 }
239
240 /// Traverses the trie from the current state for this input char.
241 ///
242 /// # Examples
243 ///
244 /// ```
245 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
246 /// use zerovec::ZeroVec;
247 ///
248 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
249 /// let trie_data = [48, 97, 176, 98, 32868];
250 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
251 ///
252 /// let mut iter = trie.iter();
253 /// let res = iter.next16('a' as u16);
254 /// assert_eq!(res, TrieResult::Intermediate(1));
255 /// let res = iter.next16('b' as u16);
256 /// assert_eq!(res, TrieResult::FinalValue(100));
257 /// let res = iter.next16('c' as u16);
258 /// assert_eq!(res, TrieResult::NoMatch);
259 /// ```
260 pub fn next16(&mut self, c: u16) -> TrieResult {
261 let mut pos = match self.pos {
262 Some(p) => p,
263 None => return TrieResult::NoMatch,
264 };
265 if let Some(length) = self.remaining_match_length {
266 // Remaining part of a linear-match node
267 if c == trie_unwrap!(self.trie.get(pos)) {
268 pos += 1;
269 self.pos = Some(pos);
270 if length == 0 {
271 self.remaining_match_length = None;
272 let node = trie_unwrap!(self.trie.get(pos));
273 if node >= MIN_VALUE_LEAD {
274 return self.value_result(pos);
275 }
276 } else {
277 self.remaining_match_length = Some(length - 1);
278 }
279 return TrieResult::NoValue;
280 }
281 self.stop();
282 TrieResult::NoMatch
283 } else {
284 self.next_impl(pos, c)
285 }
286 }
287
288 fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult {
289 let mut pos = pos;
290 let mut length = length;
291 if length == 0 {
292 length = trie_unwrap!(self.trie.get(pos)) as usize;
293 pos += 1;
294 }
295 length += 1;
296
297 // The length of the branch is the number of units to select from.
298 // The data structure encodes a binary search.
299 while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH {
300 if in_unit < trie_unwrap!(self.trie.get(pos)) {
301 length >>= 1;
302 pos = trie_unwrap!(self.jump_by_delta(pos + 1));
303 } else {
304 length = length - (length >> 1);
305 pos = trie_unwrap!(self.skip_delta(pos + 1));
306 }
307 }
308 // Drop down to linear search for the last few bytes.
309 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
310 // and divides length by 2.
311 loop {
312 if in_unit == trie_unwrap!(self.trie.get(pos)) {
313 pos += 1;
314 let mut node = trie_unwrap!(self.trie.get(pos));
315 if node & VALUE_IS_FINAL != 0 {
316 self.pos = Some(pos);
317 return self.value_result(pos);
318 }
319 // Use the non-final value as the jump delta.
320 pos += 1;
321
322 if node < MIN_TWO_UNIT_VALUE_LEAD {
323 pos += node as usize;
324 } else if node < THREE_UNIT_VALUE_LEAD {
325 pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize
326 | trie_unwrap!(self.trie.get(pos)) as usize;
327 pos += 1;
328 } else {
329 pos += (trie_unwrap!(self.trie.get(pos)) as usize) << 16
330 | trie_unwrap!(self.trie.get(pos + 1)) as usize;
331 pos += 2;
332 }
333 node = trie_unwrap!(self.trie.get(pos));
334 self.pos = Some(pos);
335
336 if node >= MIN_VALUE_LEAD {
337 return self.value_result(pos);
338 }
339 return TrieResult::NoValue;
340 }
341 length -= 1;
342 pos = trie_unwrap!(self.skip_value(pos + 1));
343 if length <= 1 {
344 break;
345 }
346 }
347
348 if in_unit == trie_unwrap!(self.trie.get(pos)) {
349 pos += 1;
350 self.pos = Some(pos);
351 let node = trie_unwrap!(self.trie.get(pos));
352 if node >= MIN_VALUE_LEAD {
353 return self.value_result(pos);
354 }
355 TrieResult::NoValue
356 } else {
357 self.stop();
358 TrieResult::NoMatch
359 }
360 }
361
362 fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult {
363 let mut node = trie_unwrap!(self.trie.get(pos));
364 let mut pos = pos + 1;
365 loop {
366 if node < MIN_LINEAR_MATCH {
367 return self.branch_next(pos, node as usize, in_unit);
368 } else if node < MIN_VALUE_LEAD {
369 // Match the first of length+1 units.
370 let length = node - MIN_LINEAR_MATCH;
371 if in_unit == trie_unwrap!(self.trie.get(pos)) {
372 pos += 1;
373 if length == 0 {
374 self.remaining_match_length = None;
375 self.pos = Some(pos);
376 node = trie_unwrap!(self.trie.get(pos));
377 if node >= MIN_VALUE_LEAD {
378 return self.value_result(pos);
379 }
380 return TrieResult::NoValue;
381 }
382 self.remaining_match_length = Some(length as usize - 1);
383 self.pos = Some(pos);
384 return TrieResult::NoValue;
385 }
386 // No match
387 break;
388 } else if (node & VALUE_IS_FINAL) != 0 {
389 // No further matching units.
390 break;
391 } else {
392 // Skip intermediate value.
393 pos = skip_node_value(pos, node);
394 node &= NODE_TYPE_MASK;
395 }
396 }
397 self.stop();
398 TrieResult::NoMatch
399 }
400
401 fn stop(&mut self) {
402 self.pos = None;
403 }
404
405 #[inline(always)] // 1 call site and we want the Option to go away
406 fn jump_by_delta(&self, pos: usize) -> Option<usize> {
407 let delta = self.trie.get(pos)?;
408 let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
409 // nothing to do
410 pos + 1 + delta as usize
411 } else if delta == THREE_UNIT_DELTA_LEAD {
412 let delta =
413 ((self.trie.get(pos + 1)? as usize) << 16) | (self.trie.get(pos + 2)? as usize);
414 pos + delta + 3
415 } else {
416 let delta = ((delta - MIN_TWO_UNIT_DELTA_LEAD) as usize) << 16
417 | (self.trie.get(pos + 1)? as usize);
418 pos + delta + 2
419 };
420 Some(v)
421 }
422
423 #[inline(always)] // 1 call site and we want the Option to go away
424 fn skip_value(&self, pos: usize) -> Option<usize> {
425 let lead_unit = self.trie.get(pos)?;
426 Some(skip_value(pos + 1, lead_unit & 0x7fff))
427 }
428
429 #[inline(always)] // 1 call site and we want the Option to go away
430 fn skip_delta(&self, pos: usize) -> Option<usize> {
431 let delta = self.trie.get(pos)?;
432 let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
433 pos + 1
434 } else if delta == THREE_UNIT_DELTA_LEAD {
435 pos + 3
436 } else {
437 pos + 2
438 };
439 Some(v)
440 }
441
442 fn value_result(&self, pos: usize) -> TrieResult {
443 match self.get_value(pos) {
444 Some(result) => result,
445 None => {
446 // Unexpected
447 debug_assert!(false);
448 TrieResult::NoMatch
449 }
450 }
451 }
452
453 #[inline(always)] // 1 call site and we want the Option to go away
454 fn get_value(&self, pos: usize) -> Option<TrieResult> {
455 let lead_unit = self.trie.get(pos)?;
456 if lead_unit & VALUE_IS_FINAL == VALUE_IS_FINAL {
457 self.read_value(pos + 1, lead_unit & 0x7fff)
458 .map(TrieResult::FinalValue)
459 } else {
460 self.read_node_value(pos + 1, lead_unit)
461 .map(TrieResult::Intermediate)
462 }
463 }
464
465 #[inline(always)] // 1 call site and we want the Option to go away
466 fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
467 let v = if lead_unit < MIN_TWO_UNIT_VALUE_LEAD {
468 lead_unit.into()
469 } else if lead_unit < THREE_UNIT_VALUE_LEAD {
470 ((lead_unit - MIN_TWO_UNIT_VALUE_LEAD) as i32) << 16 | self.trie.get(pos)? as i32
471 } else {
472 (self.trie.get(pos)? as i32) << 16 | self.trie.get(pos + 1)? as i32
473 };
474 Some(v)
475 }
476
477 #[inline(always)] // 1 call site and we want the Option to go away
478 fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
479 let v = if lead_unit < (MIN_TWO_UNIT_NODE_VALUE_LEAD) {
480 ((lead_unit >> 6) - 1).into()
481 } else if lead_unit < THREE_UNIT_NODE_VALUE_LEAD {
482 (((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD) as i32) << 10
483 | self.trie.get(pos)? as i32
484 } else {
485 (self.trie.get(pos)? as i32) << 16 | self.trie.get(pos + 1)? as i32
486 };
487 Some(v)
488 }
489}