zerotrie/
varint.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Varint spec for ZeroTrie:
6//!
7//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
8//! - Trail bytes: top bit is varint extender; rest are low bits of value
9//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
10//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
11//!
12//! This is best shown by examples.
13//!
14//! ```txt
15//! xxx0'1010 = 10
16//! xxx0'1111 = 15 (largest single-byte value with M=3)
17//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
18//! xxx1'0000 0000'0001 = 17
19//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
20//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
21//! xxx1'0000 1000'0000 0000'0001 = 2065
22//! ```
23//!
24//! The latent values by number of bytes for M=3 are:
25//!
26//! - 1 byte: 0
27//! - 2 bytes: 16 = 0x10 = 0b10000
28//! - 3 bytes: 2064 = 0x810 = 0b100000010000
29//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
30//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
31//! - …
32//!
33//! For M=2, the latent values are:
34//!
35//! - 1 byte: 0
36//! - 2 bytes: 32 = 0x20 = 0b100000
37//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
38//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
39//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
40//! - …
41
42use crate::builder::konst::ConstArrayBuilder;
43
44#[cfg(feature = "alloc")]
45use crate::builder::nonconst::TrieBuilderStore;
46
47/// Reads a varint with 2 bits of metadata in the lead byte.
48///
49/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
50///
51/// If the varint spills off the end of the slice, a debug assertion will fail,
52/// and the function will return the value up to that point.
53pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
54    let mut value = (start & 0b00011111) as usize;
55    let mut remainder = remainder;
56    if (start & 0b00100000) != 0 {
57        loop {
58            let next;
59            (next, remainder) = match remainder.split_first() {
    Some(x) => x,
    None => {
        if true {
            if !false {
                {
                    ::core::panicking::panic_fmt(format_args!("invalid varint"));
                }
            };
        };
        break;
    }
}debug_unwrap!(remainder.split_first(), break, "invalid varint");
60            // Note: value << 7 could drop high bits. The first addition can't overflow.
61            // The second addition could overflow; in such a case we just inform the
62            // developer via the debug assertion.
63            value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
64            if (*next & 0b10000000) == 0 {
65                break;
66            }
67        }
68    }
69    (value, remainder)
70}
71
72/// Reads a varint with 3 bits of metadata in the lead byte.
73///
74/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
75///
76/// If the varint spills off the end of the slice, a debug assertion will fail,
77/// and the function will return the value up to that point.
78pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
79    let mut value = (start & 0b00001111) as usize;
80    let mut remainder = remainder;
81    if (start & 0b00010000) != 0 {
82        loop {
83            let next;
84            (next, remainder) = match remainder.split_first() {
    Some(x) => x,
    None => {
        if true {
            if !false {
                {
                    ::core::panicking::panic_fmt(format_args!("invalid varint"));
                }
            };
        };
        break;
    }
}debug_unwrap!(remainder.split_first(), break, "invalid varint");
85            // Note: value << 7 could drop high bits. The first addition can't overflow.
86            // The second addition could overflow; in such a case we just inform the
87            // developer via the debug assertion.
88            value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
89            if (*next & 0b10000000) == 0 {
90                break;
91            }
92        }
93    }
94    (value, remainder)
95}
96
97/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
98///
99/// Returns the varint value.
100#[cfg(feature = "alloc")]
101pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
102    start: u8,
103    remainder: &mut S,
104) -> Option<usize> {
105    let mut value = (start & 0b00001111) as usize;
106    if (start & 0b00010000) != 0 {
107        loop {
108            let next = remainder.atbs_pop_front()?;
109            // Note: value << 7 could drop high bits. The first addition can't overflow.
110            // The second addition could overflow; in such a case we just inform the
111            // developer via the debug assertion.
112            value = (value << 7) + ((next & 0b01111111) as usize) + 16;
113            if (next & 0b10000000) == 0 {
114                break;
115            }
116        }
117    }
118    Some(value)
119}
120
121#[cfg(test)]
122const MAX_VARINT: usize = usize::MAX;
123
124// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
125// Add an extra 1 since the lead byte holds only 5 bits of data.
126const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
127
128/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
129#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
130pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
131    let mut result = [0; MAX_VARINT_LENGTH];
132    let mut i = MAX_VARINT_LENGTH - 1;
133    let mut value = value;
134    let mut last = true;
135    loop {
136        if value < 32 {
137            result[i] = value as u8;
138            if !last {
139                result[i] |= 0b00100000;
140            }
141            break;
142        }
143        value -= 32;
144        result[i] = (value as u8) & 0b01111111;
145        if !last {
146            result[i] |= 0b10000000;
147        } else {
148            last = false;
149        }
150        value >>= 7;
151        i -= 1;
152    }
153    // The bytes are from i to the end.
154    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
155}
156
157/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
158#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
159pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
160    let mut result = [0; MAX_VARINT_LENGTH];
161    let mut i = MAX_VARINT_LENGTH - 1;
162    let mut value = value;
163    let mut last = true;
164    loop {
165        if value < 16 {
166            result[i] = value as u8;
167            if !last {
168                result[i] |= 0b00010000;
169            }
170            break;
171        }
172        value -= 16;
173        result[i] = (value as u8) & 0b01111111;
174        if !last {
175            result[i] |= 0b10000000;
176        } else {
177            last = false;
178        }
179        value >>= 7;
180        i -= 1;
181    }
182    // The bytes are from i to the end.
183    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
184}
185
186/// A secondary implementation that separates the latent value while computing the varint.
187#[cfg(test)]
188pub(crate) const fn write_varint_reference(
189    value: usize,
190) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
191    let mut result = [0; MAX_VARINT_LENGTH];
192    if value < 32 {
193        result[0] = value as u8;
194        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
195    }
196    result[0] = 32;
197    let mut latent = 32;
198    let mut steps = 2;
199    loop {
200        let next_latent = (latent << 7) + 32;
201        if value < next_latent || next_latent == latent {
202            break;
203        }
204        latent = next_latent;
205        steps += 1;
206    }
207    let mut value = value - latent;
208    let mut i = steps;
209    while i > 0 {
210        i -= 1;
211        result[i] |= (value as u8) & 0b01111111;
212        value >>= 7;
213        if i > 0 && i < steps - 1 {
214            result[i] |= 0b10000000;
215        }
216    }
217    // The bytes are from 0 to `steps`.
218    ConstArrayBuilder::from_manual_slice(result, 0, steps)
219}
220
221#[cfg(test)]
222mod tests {
223    use super::*;
224
225    #[derive(Debug)]
226    struct TestCase<'a> {
227        bytes: &'a [u8],
228        remainder: &'a [u8],
229        value: usize,
230    }
231    static CASES: &[TestCase] = &[
232        TestCase {
233            bytes: &[0b00000000],
234            remainder: &[],
235            value: 0,
236        },
237        TestCase {
238            bytes: &[0b00001010],
239            remainder: &[],
240            value: 10,
241        },
242        TestCase {
243            bytes: &[0b00011111],
244            remainder: &[],
245            value: 31,
246        },
247        TestCase {
248            bytes: &[0b00011111, 0b10101010],
249            remainder: &[0b10101010],
250            value: 31,
251        },
252        TestCase {
253            bytes: &[0b00100000, 0b00000000],
254            remainder: &[],
255            value: 32,
256        },
257        TestCase {
258            bytes: &[0b00100000, 0b00000001],
259            remainder: &[],
260            value: 33,
261        },
262        TestCase {
263            bytes: &[0b00100000, 0b00100000],
264            remainder: &[],
265            value: 64,
266        },
267        TestCase {
268            bytes: &[0x20, 0x44],
269            remainder: &[],
270            value: 100,
271        },
272        TestCase {
273            bytes: &[0b00100000, 0b01111111],
274            remainder: &[],
275            value: 159,
276        },
277        TestCase {
278            bytes: &[0b00100001, 0b00000000],
279            remainder: &[],
280            value: 160,
281        },
282        TestCase {
283            bytes: &[0b00100001, 0b00000001],
284            remainder: &[],
285            value: 161,
286        },
287        TestCase {
288            bytes: &[0x23, 0x54],
289            remainder: &[],
290            value: 500,
291        },
292        TestCase {
293            bytes: &[0b00111111, 0b01111111],
294            remainder: &[],
295            value: 4127, // 32 + (1 << 12) - 1
296        },
297        TestCase {
298            bytes: &[0b00100000, 0b10000000, 0b00000000],
299            remainder: &[],
300            value: 4128, // 32 + (1 << 12)
301        },
302        TestCase {
303            bytes: &[0b00100000, 0b10000000, 0b00000001],
304            remainder: &[],
305            value: 4129, // 32 + (1 << 12) + 1
306        },
307        TestCase {
308            bytes: &[0b00100000, 0b10000000, 0b01111111],
309            remainder: &[],
310            value: 4255, // 32 + (1 << 12) + 127
311        },
312        TestCase {
313            bytes: &[0b00100000, 0b10000001, 0b00000000],
314            remainder: &[],
315            value: 4256, // 32 + (1 << 12) + 128
316        },
317        TestCase {
318            bytes: &[0b00100000, 0b10000001, 0b00000001],
319            remainder: &[],
320            value: 4257, // 32 + (1 << 12) + 129
321        },
322        TestCase {
323            bytes: &[0x20, 0x86, 0x68],
324            remainder: &[],
325            value: 5000,
326        },
327        TestCase {
328            bytes: &[0b00100000, 0b11111111, 0b01111111],
329            remainder: &[],
330            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
331        },
332        TestCase {
333            bytes: &[0b00100001, 0b10000000, 0b00000000],
334            remainder: &[],
335            value: 20512, // 32 + (1 << 12) + (1 << 14)
336        },
337        TestCase {
338            bytes: &[0b00111111, 0b11111111, 0b01111111],
339            remainder: &[],
340            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
341        },
342        TestCase {
343            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
344            remainder: &[],
345            value: 528416, // 32 + (1 << 12) + (1 << 19)
346        },
347        TestCase {
348            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
349            remainder: &[],
350            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
351        },
352        TestCase {
353            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
354            remainder: &[],
355            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
356        },
357        TestCase {
358            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
359            remainder: &[],
360            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
361        },
362    ];
363
364    #[test]
365    fn test_read() {
366        for cas in CASES {
367            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
368            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
369        }
370    }
371
372    #[test]
373    fn test_read_write() {
374        for cas in CASES {
375            let reference_bytes = write_varint_reference(cas.value);
376            assert_eq!(
377                reference_bytes.len(),
378                cas.bytes.len() - cas.remainder.len(),
379                "{cas:?}"
380            );
381            assert_eq!(
382                reference_bytes.as_slice(),
383                &cas.bytes[0..reference_bytes.len()],
384                "{cas:?}"
385            );
386            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
387            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
388            let write_bytes = write_varint_meta2(cas.value);
389            assert_eq!(
390                reference_bytes.as_slice(),
391                write_bytes.as_slice(),
392                "{cas:?}"
393            );
394        }
395    }
396
397    #[test]
398    fn test_roundtrip() {
399        let mut i = 0usize;
400        while i < MAX_VARINT {
401            let bytes = write_varint_meta2(i);
402            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
403            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
404            i <<= 1;
405            i += 1;
406        }
407    }
408
409    #[test]
410    fn test_extended_roundtrip() {
411        let mut i = 0usize;
412        while i < MAX_VARINT {
413            let bytes = write_varint_meta3(i);
414            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
415            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
416            i <<= 1;
417            i += 1;
418        }
419    }
420
421    #[test]
422    fn test_max() {
423        let reference_bytes = write_varint_reference(MAX_VARINT);
424        let write_bytes = write_varint_meta2(MAX_VARINT);
425        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
426        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
427        let subarray = write_bytes
428            .as_const_slice()
429            .get_subslice_or_panic(1, write_bytes.len());
430        let (recovered_value, remainder) = read_varint_meta2(
431            *write_bytes.as_const_slice().first().unwrap(),
432            subarray.as_slice(),
433        );
434        assert!(remainder.is_empty());
435        assert_eq!(recovered_value, MAX_VARINT);
436        #[cfg(target_pointer_width = "64")]
437        assert_eq!(
438            write_bytes.as_slice(),
439            &[
440                0b00100001, //
441                0b11011111, //
442                0b11011111, //
443                0b11011111, //
444                0b11011111, //
445                0b11011111, //
446                0b11011111, //
447                0b11011111, //
448                0b11011111, //
449                0b01011111, //
450            ]
451        );
452        #[cfg(target_pointer_width = "32")]
453        assert_eq!(
454            write_bytes.as_slice(),
455            &[
456                0b00101111, //
457                0b11011111, //
458                0b11011111, //
459                0b11011111, //
460                0b01011111, //
461            ]
462        );
463    }
464
465    #[test]
466    fn text_extended_max() {
467        let write_bytes = write_varint_meta3(MAX_VARINT);
468        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
469        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
470        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
471        assert!(remainder.is_empty());
472        assert_eq!(recovered_value, MAX_VARINT);
473        #[cfg(target_pointer_width = "64")]
474        assert_eq!(
475            write_bytes.as_slice(),
476            &[
477                0b00010001, //
478                0b11101111, //
479                0b11101111, //
480                0b11101111, //
481                0b11101111, //
482                0b11101111, //
483                0b11101111, //
484                0b11101111, //
485                0b11101111, //
486                0b01101111, //
487            ]
488        );
489        #[cfg(target_pointer_width = "32")]
490        assert_eq!(
491            write_bytes.as_slice(),
492            &[
493                0b00011111, //
494                0b11101111, //
495                0b11101111, //
496                0b11101111, //
497                0b01101111, //
498            ]
499        );
500    }
501
502    #[test]
503    fn test_latent_values() {
504        // Same values documented in the module docs: M=2
505        let m2 = read_varint_meta2;
506        assert_eq!(m2(0, &[]).0, 0);
507        assert_eq!(m2(0x20, &[0x00]).0, 32);
508        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
509        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
510        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
511
512        // Same values documented in the module docs: M=3
513        let m3 = read_varint_meta3;
514        assert_eq!(m3(0, &[]).0, 0);
515        assert_eq!(m3(0x10, &[0x00]).0, 16);
516        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
517        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
518        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
519    }
520}