zerovec/varzerovec/
components.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::ule::*;
6use alloc::boxed::Box;
7use alloc::format;
8use alloc::string::String;
9use alloc::vec::Vec;
10use core::cmp::Ordering;
11use core::convert::TryFrom;
12use core::marker::PhantomData;
13use core::ops::Range;
14
15// Also used by owned.rs
16pub(super) const LENGTH_WIDTH: usize = 4;
17pub(super) const METADATA_WIDTH: usize = 0;
18pub(super) const MAX_LENGTH: usize = u32::MAX as usize;
19pub(super) const MAX_INDEX: usize = u32::MAX as usize;
20
21/// This trait allows switching between different possible internal
22/// representations of VarZeroVec.
23///
24/// Currently this crate supports two formats: [`Index16`] and [`Index32`],
25/// with [`Index16`] being the default for all [`VarZeroVec`](super::VarZeroVec)
26/// types unless explicitly specified otherwise.
27///
28/// Do not implement this trait, its internals may be changed in the future,
29/// and all of its associated items are hidden from the docs.
30#[allow(clippy::missing_safety_doc)] // no safety section for you, don't implement this trait period
31pub unsafe trait VarZeroVecFormat: 'static + Sized {
32    #[doc(hidden)]
33    const INDEX_WIDTH: usize;
34    #[doc(hidden)]
35    const MAX_VALUE: u32;
36    /// This is always `RawBytesULE<Self::INDEX_WIDTH>` however
37    /// Rust does not currently support using associated constants in const
38    /// generics
39    #[doc(hidden)]
40    type RawBytes: ULE;
41
42    // various conversions because RawBytes is an associated constant now
43    #[doc(hidden)]
44    fn rawbytes_to_usize(raw: Self::RawBytes) -> usize;
45    #[doc(hidden)]
46    fn usize_to_rawbytes(u: usize) -> Self::RawBytes;
47
48    #[doc(hidden)]
49    fn rawbytes_from_byte_slice_unchecked_mut(bytes: &mut [u8]) -> &mut [Self::RawBytes];
50}
51
52/// This is a [`VarZeroVecFormat`] that stores u16s in the index array.
53/// Will have a smaller data size, but it's more likely for larger arrays
54/// to be unrepresentable (and error on construction)
55///
56/// This is the default index size used by all [`VarZeroVec`](super::VarZeroVec) types.
57#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
58#[allow(clippy::exhaustive_structs)] // marker
59pub struct Index16;
60
61/// This is a [`VarZeroVecFormat`] that stores u32s in the index array.
62/// Will have a larger data size, but will support large arrays without
63/// problems.
64#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
65#[allow(clippy::exhaustive_structs)] // marker
66pub struct Index32;
67
68unsafe impl VarZeroVecFormat for Index16 {
69    const INDEX_WIDTH: usize = 2;
70    const MAX_VALUE: u32 = u16::MAX as u32;
71    type RawBytes = RawBytesULE<2>;
72    #[inline]
73    fn rawbytes_to_usize(raw: Self::RawBytes) -> usize {
74        raw.as_unsigned_int() as usize
75    }
76    #[inline]
77    fn usize_to_rawbytes(u: usize) -> Self::RawBytes {
78        (u as u16).to_unaligned()
79    }
80    #[inline]
81    fn rawbytes_from_byte_slice_unchecked_mut(bytes: &mut [u8]) -> &mut [Self::RawBytes] {
82        Self::RawBytes::from_byte_slice_unchecked_mut(bytes)
83    }
84}
85
86unsafe impl VarZeroVecFormat for Index32 {
87    const INDEX_WIDTH: usize = 4;
88    const MAX_VALUE: u32 = u32::MAX;
89    type RawBytes = RawBytesULE<4>;
90    #[inline]
91    fn rawbytes_to_usize(raw: Self::RawBytes) -> usize {
92        raw.as_unsigned_int() as usize
93    }
94    #[inline]
95    fn usize_to_rawbytes(u: usize) -> Self::RawBytes {
96        (u as u32).to_unaligned()
97    }
98    #[inline]
99    fn rawbytes_from_byte_slice_unchecked_mut(bytes: &mut [u8]) -> &mut [Self::RawBytes] {
100        Self::RawBytes::from_byte_slice_unchecked_mut(bytes)
101    }
102}
103
104/// A more parsed version of `VarZeroSlice`. This type is where most of the VarZeroVec
105/// internal representation code lies.
106///
107/// This is *basically* an `&'a [u8]` to a zero copy buffer, but split out into
108/// the buffer components. Logically this is capable of behaving as
109/// a `&'a [T::VarULE]`, but since `T::VarULE` is unsized that type does not actually
110/// exist.
111///
112/// See [`VarZeroVecComponents::parse_byte_slice()`] for information on the internal invariants involved
113#[derive(Debug)]
114pub struct VarZeroVecComponents<'a, T: ?Sized, F> {
115    /// The number of elements
116    len: u32,
117    /// The list of indices into the `things` slice
118    indices: &'a [u8],
119    /// The contiguous list of `T::VarULE`s
120    things: &'a [u8],
121    /// The original slice this was constructed from
122    entire_slice: &'a [u8],
123    marker: PhantomData<(&'a T, F)>,
124}
125
126// #[derive()] won't work here since we do not want it to be
127// bound on T: Copy
128impl<'a, T: ?Sized, F> Copy for VarZeroVecComponents<'a, T, F> {}
129impl<'a, T: ?Sized, F> Clone for VarZeroVecComponents<'a, T, F> {
130    fn clone(&self) -> Self {
131        *self
132    }
133}
134
135impl<'a, T: VarULE + ?Sized, F> Default for VarZeroVecComponents<'a, T, F> {
136    #[inline]
137    fn default() -> Self {
138        Self::new()
139    }
140}
141
142impl<'a, T: VarULE + ?Sized, F> VarZeroVecComponents<'a, T, F> {
143    #[inline]
144    pub fn new() -> Self {
145        Self {
146            len: 0,
147            indices: &[],
148            things: &[],
149            entire_slice: &[],
150            marker: PhantomData,
151        }
152    }
153}
154impl<'a, T: VarULE + ?Sized, F: VarZeroVecFormat> VarZeroVecComponents<'a, T, F> {
155    /// Construct a new VarZeroVecComponents, checking invariants about the overall buffer size:
156    ///
157    /// - There must be either zero or at least four bytes (if four, this is the "length" parsed as a usize)
158    /// - There must be at least `4*length + 4` bytes total, to form the array `indices` of indices
159    /// - `indices[i]..indices[i+1]` must index into a valid section of
160    ///   `things`, such that it parses to a `T::VarULE`
161    /// - `indices[len - 1]..things.len()` must index into a valid section of
162    ///   `things`, such that it parses to a `T::VarULE`
163    #[inline]
164    pub fn parse_byte_slice(slice: &'a [u8]) -> Result<Self, ZeroVecError> {
165        // The empty VZV is special-cased to the empty slice
166        if slice.is_empty() {
167            return Ok(VarZeroVecComponents {
168                len: 0,
169                indices: &[],
170                things: &[],
171                entire_slice: slice,
172                marker: PhantomData,
173            });
174        }
175        let len_bytes = slice
176            .get(0..LENGTH_WIDTH)
177            .ok_or(ZeroVecError::VarZeroVecFormatError)?;
178        let len_ule = RawBytesULE::<LENGTH_WIDTH>::parse_byte_slice(len_bytes)
179            .map_err(|_| ZeroVecError::VarZeroVecFormatError)?;
180
181        let len = len_ule
182            .first()
183            .ok_or(ZeroVecError::VarZeroVecFormatError)?
184            .as_unsigned_int();
185        let indices_bytes = slice
186            .get(
187                LENGTH_WIDTH + METADATA_WIDTH
188                    ..LENGTH_WIDTH + METADATA_WIDTH + F::INDEX_WIDTH * (len as usize),
189            )
190            .ok_or(ZeroVecError::VarZeroVecFormatError)?;
191        let things = slice
192            .get(F::INDEX_WIDTH * (len as usize) + LENGTH_WIDTH + METADATA_WIDTH..)
193            .ok_or(ZeroVecError::VarZeroVecFormatError)?;
194
195        let borrowed = VarZeroVecComponents {
196            len,
197            indices: indices_bytes,
198            things,
199            entire_slice: slice,
200            marker: PhantomData,
201        };
202
203        borrowed.check_indices_and_things()?;
204
205        Ok(borrowed)
206    }
207
208    /// Construct a [`VarZeroVecComponents`] from a byte slice that has previously
209    /// successfully returned a [`VarZeroVecComponents`] when passed to
210    /// [`VarZeroVecComponents::parse_byte_slice()`]. Will return the same
211    /// object as one would get from calling [`VarZeroVecComponents::parse_byte_slice()`].
212    ///
213    /// # Safety
214    /// The bytes must have previously successfully run through
215    /// [`VarZeroVecComponents::parse_byte_slice()`]
216    pub unsafe fn from_bytes_unchecked(slice: &'a [u8]) -> Self {
217        // The empty VZV is special-cased to the empty slice
218        if slice.is_empty() {
219            return VarZeroVecComponents {
220                len: 0,
221                indices: &[],
222                things: &[],
223                entire_slice: slice,
224                marker: PhantomData,
225            };
226        }
227        let len_bytes = slice.get_unchecked(0..LENGTH_WIDTH);
228        let len_ule = RawBytesULE::<LENGTH_WIDTH>::from_byte_slice_unchecked(len_bytes);
229
230        let len = len_ule.get_unchecked(0).as_unsigned_int();
231        let indices_bytes = slice.get_unchecked(
232            LENGTH_WIDTH + METADATA_WIDTH
233                ..LENGTH_WIDTH + METADATA_WIDTH + F::INDEX_WIDTH * (len as usize),
234        );
235        let things =
236            slice.get_unchecked(LENGTH_WIDTH + METADATA_WIDTH + F::INDEX_WIDTH * (len as usize)..);
237
238        VarZeroVecComponents {
239            len,
240            indices: indices_bytes,
241            things,
242            entire_slice: slice,
243            marker: PhantomData,
244        }
245    }
246
247    /// Get the number of elements in this vector
248    #[inline]
249    pub fn len(self) -> usize {
250        self.len as usize
251    }
252
253    /// Returns `true` if the vector contains no elements.
254    #[inline]
255    pub fn is_empty(self) -> bool {
256        self.indices.is_empty()
257    }
258
259    /// Get the idx'th element out of this slice. Returns `None` if out of bounds.
260    #[inline]
261    pub fn get(self, idx: usize) -> Option<&'a T> {
262        if idx >= self.len() {
263            return None;
264        }
265        Some(unsafe { self.get_unchecked(idx) })
266    }
267
268    /// Get the idx'th element out of this slice. Does not bounds check.
269    ///
270    /// Safety:
271    /// - `idx` must be in bounds (`idx < self.len()`)
272    #[inline]
273    pub(crate) unsafe fn get_unchecked(self, idx: usize) -> &'a T {
274        let range = self.get_things_range(idx);
275        let things_slice = self.things.get_unchecked(range);
276        T::from_byte_slice_unchecked(things_slice)
277    }
278
279    /// Get the range in `things` for the element at `idx`. Does not bounds check.
280    ///
281    /// Safety:
282    /// - `idx` must be in bounds (`idx < self.len()`)
283    #[inline]
284    unsafe fn get_things_range(self, idx: usize) -> Range<usize> {
285        let start = F::rawbytes_to_usize(*self.indices_slice().get_unchecked(idx));
286        let end = if idx + 1 == self.len() {
287            self.things.len()
288        } else {
289            F::rawbytes_to_usize(*self.indices_slice().get_unchecked(idx + 1))
290        };
291        debug_assert!(start <= end);
292        start..end
293    }
294
295    /// Get the range in `entire_slice` for the element at `idx`. Does not bounds check.
296    ///
297    /// Safety:
298    /// - `idx` must be in bounds (`idx < self.len()`)
299    #[inline]
300    pub(crate) unsafe fn get_range(self, idx: usize) -> Range<usize> {
301        let range = self.get_things_range(idx);
302        let offset = (self.things as *const [u8] as *const u8)
303            .offset_from(self.entire_slice as *const [u8] as *const u8)
304            as usize;
305        range.start + offset..range.end + offset
306    }
307
308    /// Check the internal invariants of VarZeroVecComponents:
309    ///
310    /// - `indices[i]..indices[i+1]` must index into a valid section of
311    ///   `things`, such that it parses to a `T::VarULE`
312    /// - `indices[len - 1]..things.len()` must index into a valid section of
313    ///   `things`, such that it parses to a `T::VarULE`
314    /// - `indices` is monotonically increasing
315    ///
316    /// This method is NOT allowed to call any other methods on VarZeroVecComponents since all other methods
317    /// assume that the slice has been passed through check_indices_and_things
318    #[inline]
319    #[allow(clippy::len_zero)] // more explicit to enforce safety invariants
320    fn check_indices_and_things(self) -> Result<(), ZeroVecError> {
321        assert_eq!(self.len(), self.indices_slice().len());
322        if self.len() == 0 {
323            if self.things.len() > 0 {
324                return Err(ZeroVecError::VarZeroVecFormatError);
325            } else {
326                return Ok(());
327            }
328        }
329        // Safety: i is in bounds (assertion above)
330        let mut start = F::rawbytes_to_usize(unsafe { *self.indices_slice().get_unchecked(0) });
331        if start != 0 {
332            return Err(ZeroVecError::VarZeroVecFormatError);
333        }
334        for i in 0..self.len() {
335            let end = if i == self.len() - 1 {
336                self.things.len()
337            } else {
338                // Safety: i+1 is in bounds (assertion above)
339                F::rawbytes_to_usize(unsafe { *self.indices_slice().get_unchecked(i + 1) })
340            };
341            if start > end {
342                return Err(ZeroVecError::VarZeroVecFormatError);
343            }
344            if end > self.things.len() {
345                return Err(ZeroVecError::VarZeroVecFormatError);
346            }
347            // Safety: start..end is a valid range in self.things
348            let bytes = unsafe { self.things.get_unchecked(start..end) };
349            T::parse_byte_slice(bytes)?;
350            start = end;
351        }
352        Ok(())
353    }
354
355    /// Create an iterator over the Ts contained in VarZeroVecComponents
356    #[inline]
357    pub fn iter(self) -> impl Iterator<Item = &'a T> {
358        self.indices_slice()
359            .iter()
360            .copied()
361            .map(F::rawbytes_to_usize)
362            .zip(
363                self.indices_slice()
364                    .iter()
365                    .copied()
366                    .map(F::rawbytes_to_usize)
367                    .skip(1)
368                    .chain([self.things.len()]),
369            )
370            .map(move |(start, end)| unsafe { self.things.get_unchecked(start..end) })
371            .map(|bytes| unsafe { T::from_byte_slice_unchecked(bytes) })
372    }
373
374    pub fn to_vec(self) -> Vec<Box<T>> {
375        self.iter().map(T::to_boxed).collect()
376    }
377
378    #[inline]
379    fn indices_slice(&self) -> &'a [F::RawBytes] {
380        unsafe { F::RawBytes::from_byte_slice_unchecked(self.indices) }
381    }
382
383    // Dump a debuggable representation of this type
384    #[allow(unused)] // useful for debugging
385    pub(crate) fn dump(&self) -> String {
386        let indices = self
387            .indices_slice()
388            .iter()
389            .copied()
390            .map(F::rawbytes_to_usize)
391            .collect::<Vec<_>>();
392        format!("VarZeroVecComponents {{ indices: {indices:?} }}")
393    }
394}
395
396impl<'a, T, F> VarZeroVecComponents<'a, T, F>
397where
398    T: VarULE,
399    T: ?Sized,
400    T: Ord,
401    F: VarZeroVecFormat,
402{
403    /// Binary searches a sorted `VarZeroVecComponents<T>` for the given element. For more information, see
404    /// the primitive function [`binary_search`](slice::binary_search).
405    pub fn binary_search(&self, needle: &T) -> Result<usize, usize> {
406        self.binary_search_impl(|probe| probe.cmp(needle), self.indices_slice())
407    }
408
409    pub fn binary_search_in_range(
410        &self,
411        needle: &T,
412        range: Range<usize>,
413    ) -> Option<Result<usize, usize>> {
414        let indices_slice = self.indices_slice().get(range)?;
415        Some(self.binary_search_impl(|probe| probe.cmp(needle), indices_slice))
416    }
417}
418
419impl<'a, T, F> VarZeroVecComponents<'a, T, F>
420where
421    T: VarULE,
422    T: ?Sized,
423    F: VarZeroVecFormat,
424{
425    /// Binary searches a sorted `VarZeroVecComponents<T>` for the given predicate. For more information, see
426    /// the primitive function [`binary_search_by`](slice::binary_search_by).
427    pub fn binary_search_by(&self, predicate: impl FnMut(&T) -> Ordering) -> Result<usize, usize> {
428        self.binary_search_impl(predicate, self.indices_slice())
429    }
430
431    pub fn binary_search_in_range_by(
432        &self,
433        predicate: impl FnMut(&T) -> Ordering,
434        range: Range<usize>,
435    ) -> Option<Result<usize, usize>> {
436        let indices_slice = self.indices_slice().get(range)?;
437        Some(self.binary_search_impl(predicate, indices_slice))
438    }
439
440    /// Binary searches a sorted `VarZeroVecComponents<T>` with the given predicate. For more information, see
441    /// the primitive function [`binary_search`](slice::binary_search).
442    fn binary_search_impl(
443        &self,
444        mut predicate: impl FnMut(&T) -> Ordering,
445        indices_slice: &[F::RawBytes],
446    ) -> Result<usize, usize> {
447        // This code is an absolute atrocity. This code is not a place of honor. This
448        // code is known to the State of California to cause cancer.
449        //
450        // Unfortunately, the stdlib's `binary_search*` functions can only operate on slices.
451        // We do not have a slice. We have something we can .get() and index on, but that is not
452        // a slice.
453        //
454        // The `binary_search*` functions also do not have a variant where they give you the element's
455        // index, which we could otherwise use to directly index `self`.
456        // We do have `self.indices`, but these are indices into a byte buffer, which cannot in
457        // isolation be used to recoup the logical index of the element they refer to.
458        //
459        // However, `binary_search_by()` provides references to the elements of the slice being iterated.
460        // Since the layout of Rust slices is well-defined, we can do pointer arithmetic on these references
461        // to obtain the index being used by the search.
462        //
463        // It's worth noting that the slice we choose to search is irrelevant, as long as it has the appropriate
464        // length. `self.indices` is defined to have length `self.len()`, so it is convenient to use
465        // here and does not require additional allocations.
466        //
467        // The alternative to doing this is to implement our own binary search. This is significantly less fun.
468
469        // Note: We always use zero_index relative to the whole indices array, even if we are
470        // only searching a subslice of it.
471        let zero_index = self.indices.as_ptr() as *const _ as usize;
472        indices_slice.binary_search_by(|probe: &_| {
473            // `self.indices` is a vec of unaligned F::INDEX_WIDTH values, so we divide by F::INDEX_WIDTH
474            // to get the actual index
475            let index = (probe as *const _ as usize - zero_index) / F::INDEX_WIDTH;
476            // safety: we know this is in bounds
477            let actual_probe = unsafe { self.get_unchecked(index) };
478            predicate(actual_probe)
479        })
480    }
481}
482
483/// Collects the bytes for a VarZeroSlice into a Vec.
484pub fn get_serializable_bytes_non_empty<T, A, F>(elements: &[A]) -> Option<Vec<u8>>
485where
486    T: VarULE + ?Sized,
487    A: EncodeAsVarULE<T>,
488    F: VarZeroVecFormat,
489{
490    debug_assert!(!elements.is_empty());
491    let len = compute_serializable_len::<T, A, F>(elements)?;
492    debug_assert!(len >= LENGTH_WIDTH as u32);
493    let mut output: Vec<u8> = alloc::vec![0; len as usize];
494    write_serializable_bytes::<T, A, F>(elements, &mut output);
495    Some(output)
496}
497
498/// Writes the bytes for a VarZeroSlice into an output buffer.
499///
500/// Every byte in the buffer will be initialized after calling this function.
501///
502/// # Panics
503///
504/// Panics if the buffer is not exactly the correct length.
505pub fn write_serializable_bytes<T, A, F>(elements: &[A], output: &mut [u8])
506where
507    T: VarULE + ?Sized,
508    A: EncodeAsVarULE<T>,
509    F: VarZeroVecFormat,
510{
511    assert!(elements.len() <= MAX_LENGTH);
512    let num_elements_bytes = elements.len().to_le_bytes();
513    #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
514    output[0..LENGTH_WIDTH].copy_from_slice(&num_elements_bytes[0..LENGTH_WIDTH]);
515
516    // idx_offset = offset from the start of the buffer for the next index
517    let mut idx_offset: usize = LENGTH_WIDTH + METADATA_WIDTH;
518    // first_dat_offset = offset from the start of the buffer of the first data block
519    let first_dat_offset: usize = idx_offset + elements.len() * F::INDEX_WIDTH;
520    // dat_offset = offset from the start of the buffer of the next data block
521    let mut dat_offset: usize = first_dat_offset;
522
523    for element in elements.iter() {
524        let element_len = element.encode_var_ule_len();
525
526        let idx_limit = idx_offset + F::INDEX_WIDTH;
527        #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
528        let idx_slice = &mut output[idx_offset..idx_limit];
529        // VZV expects data offsets to be stored relative to the first data block
530        let idx = dat_offset - first_dat_offset;
531        assert!(idx <= MAX_INDEX);
532        #[allow(clippy::indexing_slicing)] // this function is explicitly panicky
533        idx_slice.copy_from_slice(&idx.to_le_bytes()[..F::INDEX_WIDTH]);
534
535        let dat_limit = dat_offset + element_len;
536        #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
537        let dat_slice = &mut output[dat_offset..dat_limit];
538        element.encode_var_ule_write(dat_slice);
539        debug_assert_eq!(T::validate_byte_slice(dat_slice), Ok(()));
540
541        idx_offset = idx_limit;
542        dat_offset = dat_limit;
543    }
544
545    debug_assert_eq!(
546        idx_offset,
547        LENGTH_WIDTH + METADATA_WIDTH + F::INDEX_WIDTH * elements.len()
548    );
549    assert_eq!(dat_offset, output.len());
550}
551
552pub fn compute_serializable_len<T, A, F>(elements: &[A]) -> Option<u32>
553where
554    T: VarULE + ?Sized,
555    A: EncodeAsVarULE<T>,
556    F: VarZeroVecFormat,
557{
558    let idx_len: u32 = u32::try_from(elements.len())
559        .ok()?
560        .checked_mul(F::INDEX_WIDTH as u32)?
561        .checked_add(LENGTH_WIDTH as u32)?
562        .checked_add(METADATA_WIDTH as u32)?;
563    let data_len: u32 = elements
564        .iter()
565        .map(|v| u32::try_from(v.encode_var_ule_len()).ok())
566        .try_fold(0u32, |s, v| s.checked_add(v?))?;
567    let ret = idx_len.checked_add(data_len);
568    if let Some(r) = ret {
569        if r >= F::MAX_VALUE {
570            return None;
571        }
572    }
573    ret
574}