zerotrie/builder/konst/
builder.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::super::branch_meta::BranchMeta;
6use super::super::bytestr::ByteStr;
7use super::store::const_for_each;
8use super::store::ConstArrayBuilder;
9use super::store::ConstLengthsStack;
10use super::store::ConstSlice;
11use crate::error::ZeroTrieBuildError;
12use crate::varint;
13
14/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
15///
16/// All methods that grow the trie will panic if the capacity N is not enough.
17pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
18    data: ConstArrayBuilder<N, u8>,
19}
20
21impl<const N: usize> ZeroTrieBuilderConst<N> {
22    /// Non-const function that returns the current trie data as a slice.
23    #[cfg(feature = "litemap")]
24    pub fn as_bytes(&self) -> &[u8] {
25        self.data.as_const_slice().as_slice()
26    }
27
28    /// Returns the trie data, panicking if the buffer is the wrong size.
29    pub const fn build_or_panic(self) -> [u8; N] {
30        self.data.const_build_or_panic()
31    }
32
33    /// Creates a new empty builder.
34    pub const fn new() -> Self {
35        Self {
36            data: ConstArrayBuilder::new_empty([0; N], N),
37        }
38    }
39
40    /// Prepends an ASCII node to the front of the builder. Returns the new builder
41    /// and the delta in length, which is always 1.
42    #[must_use]
43    const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
44        if ascii >= 128 {
45            {
    ::core::panicking::panic_fmt(format_args!("Non-ASCII not supported in ZeroTrieSimpleAscii"));
};panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
46        }
47        let data = self.data.const_push_front_or_panic(ascii);
48        (Self { data }, 1)
49    }
50
51    /// Prepends a value node to the front of the builder. Returns the new builder
52    /// and the delta in length, which depends on the size of the varint.
53    #[must_use]
54    const fn prepend_value(self, value: usize) -> (Self, usize) {
55        let mut data = self.data;
56        let varint_array = varint::write_varint_meta3(value);
57        // Can panic (as documented in class docs):
58        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
59        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
60        data = data.const_bitor_assign_or_panic(0, 0b10000000);
61        (Self { data }, varint_array.len())
62    }
63
64    /// Prepends a branch node to the front of the builder. Returns the new builder
65    /// and the delta in length, which depends on the size of the varint.
66    #[must_use]
67    const fn prepend_branch(self, value: usize) -> (Self, usize) {
68        let mut data = self.data;
69        let varint_array = varint::write_varint_meta2(value);
70        // Can panic (as documented in type-level docs):
71        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
72        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
73        data = data.const_bitor_assign_or_panic(0, 0b11000000);
74        (Self { data }, varint_array.len())
75    }
76
77    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
78    /// and the delta in length, which is the length of the slice.
79    #[must_use]
80    const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
81        let mut data = self.data;
82        let mut i = s.len();
83        while i > 0 {
84            // Can panic (as documented in type-level docs):
85            data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
86            i -= 1;
87        }
88        (Self { data }, s.len())
89    }
90
91    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
92    #[must_use]
93    const fn prepend_n_zeros(self, n: usize) -> Self {
94        let mut data = self.data;
95        let mut i = 0;
96        while i < n {
97            // Can panic (as documented in type-level docs):
98            data = data.const_push_front_or_panic(0);
99            i += 1;
100        }
101        Self { data }
102    }
103
104    /// Performs the operation `self[index] |= bits`
105    const fn bitor_assign_at_or_panic(self, index: usize, bits: u8) -> Self {
106        let mut data = self.data;
107        data = data.const_bitor_assign_or_panic(index, bits);
108        Self { data }
109    }
110
111    /// Creates a new builder containing the elements in the given slice of key/value pairs.
112    ///
113    /// `K` is the stack size of the lengths stack. If you get an error such as
114    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
115    ///
116    /// # Panics
117    ///
118    /// Panics if the items are not sorted
119    pub const fn from_tuple_slice<'a, const K: usize>(
120        items: &[(&'a ByteStr, usize)],
121    ) -> Result<Self, ZeroTrieBuildError> {
122        let items = ConstSlice::from_slice(items);
123        let mut prev: Option<&'a ByteStr> = None;
124        {
    let mut i = 0;
    while i < items.len() {
        let (ascii_str, _) = items.get_or_panic(i);
        {
            match prev {
                None => (),
                Some(prev) => {
                    if !prev.is_less_then(ascii_str) {
                        {
                            ::core::panicking::panic_fmt(format_args!("Strings in ByteStr constructor are not sorted"));
                        };
                    }
                }
            };
            prev = Some(ascii_str)
        };
        i += 1;
    }
};const_for_each!(items, (ascii_str, _), {
125            match prev {
126                None => (),
127                Some(prev) => {
128                    if !prev.is_less_then(ascii_str) {
129                        panic!("Strings in ByteStr constructor are not sorted");
130                    }
131                }
132            };
133            prev = Some(ascii_str)
134        });
135        Self::from_sorted_const_tuple_slice::<K>(items)
136    }
137
138    /// Creates a new builder containing the elements in the given slice of key/value pairs.
139    ///
140    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
141    ///
142    /// `K` is the stack size of the lengths stack. If you get an error such as
143    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
144    pub const fn from_sorted_const_tuple_slice<const K: usize>(
145        items: ConstSlice<(&ByteStr, usize)>,
146    ) -> Result<Self, ZeroTrieBuildError> {
147        let mut result = Self::new();
148        let total_size;
149        (result, total_size) = result.create_or_panic::<K>(items);
150        if true {
    if !(total_size == result.data.len()) {
        ::core::panicking::panic("assertion failed: total_size == result.data.len()")
    };
};debug_assert!(total_size == result.data.len());
151        Ok(result)
152    }
153
154    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
155    #[must_use]
156    const fn create_or_panic<const K: usize>(
157        mut self,
158        all_items: ConstSlice<(&ByteStr, usize)>,
159    ) -> (Self, usize) {
160        let mut prefix_len = match all_items.last() {
161            Some(x) => x.0.len(),
162            // Empty slice:
163            None => return (Self::new(), 0),
164        };
165        // Initialize the main loop to point at the last string.
166        let mut lengths_stack = ConstLengthsStack::<K>::new();
167        let mut i = all_items.len() - 1;
168        let mut j = all_items.len();
169        let mut current_len = 0;
170        // Start the main loop.
171        loop {
172            let item_i = all_items.get_or_panic(i);
173            let item_j = all_items.get_or_panic(j - 1);
174            if true {
    if !item_i.0.prefix_eq(item_j.0, prefix_len) {
        ::core::panicking::panic("assertion failed: item_i.0.prefix_eq(item_j.0, prefix_len)")
    };
};debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
175            // Check if we need to add a value node here.
176            if item_i.0.len() == prefix_len {
177                let len;
178                (self, len) = self.prepend_value(item_i.1);
179                current_len += len;
180            }
181            if prefix_len == 0 {
182                // All done! Leave the main loop.
183                break;
184            }
185            // Reduce the prefix length by 1 and recalculate i and j.
186            prefix_len -= 1;
187            let mut new_i = i;
188            let mut new_j = j;
189            let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
190            let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
191            if true {
    if !(ascii_i == ascii_j) {
        ::core::panicking::panic("assertion failed: ascii_i == ascii_j")
    };
};debug_assert!(ascii_i == ascii_j);
192            let key_ascii = ascii_i;
193            loop {
194                if new_i == 0 {
195                    break;
196                }
197                let candidate = all_items.get_or_panic(new_i - 1).0;
198                if candidate.len() < prefix_len {
199                    // Too short
200                    break;
201                }
202                if item_i.0.prefix_eq(candidate, prefix_len) {
203                    new_i -= 1;
204                } else {
205                    break;
206                }
207                if candidate.len() == prefix_len {
208                    // A string that equals the prefix does not take part in the branch node.
209                    break;
210                }
211                let candidate = candidate.byte_at_or_panic(prefix_len);
212                if candidate != ascii_i {
213                    ascii_i = candidate;
214                }
215            }
216            loop {
217                if new_j == all_items.len() {
218                    break;
219                }
220                let candidate = all_items.get_or_panic(new_j).0;
221                if candidate.len() < prefix_len {
222                    // Too short
223                    break;
224                }
225                if item_j.0.prefix_eq(candidate, prefix_len) {
226                    new_j += 1;
227                } else {
228                    break;
229                }
230                if candidate.len() == prefix_len {
231                    {
    ::core::panicking::panic_fmt(format_args!("A shorter string should be earlier in the sequence"));
};panic!("A shorter string should be earlier in the sequence");
232                }
233                let candidate = candidate.byte_at_or_panic(prefix_len);
234                if candidate != ascii_j {
235                    ascii_j = candidate;
236                }
237            }
238            // If there are no different bytes at this prefix level, we can add an ASCII or Span
239            // node and then continue to the next iteration of the main loop.
240            if ascii_i == key_ascii && ascii_j == key_ascii {
241                let len;
242                (self, len) = self.prepend_ascii(ascii_i);
243                current_len += len;
244                if true {
    if !(i == new_i || i == new_i + 1) {
        ::core::panicking::panic("assertion failed: i == new_i || i == new_i + 1")
    };
};debug_assert!(i == new_i || i == new_i + 1);
245                i = new_i;
246                if true {
    if !(j == new_j) {
        ::core::panicking::panic("assertion failed: j == new_j")
    };
};debug_assert!(j == new_j);
247                continue;
248            }
249            // If i and j changed, we are a target of a branch node.
250            if ascii_j == key_ascii {
251                // We are the _last_ target of a branch node.
252                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
253                    ascii: key_ascii,
254                    cumulative_length: current_len,
255                    local_length: current_len,
256                    count: 1,
257                });
258            } else {
259                // We are the _not the last_ target of a branch node.
260                let BranchMeta {
261                    cumulative_length,
262                    count,
263                    ..
264                } = lengths_stack.peek_or_panic();
265                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
266                    ascii: key_ascii,
267                    cumulative_length: cumulative_length + current_len,
268                    local_length: current_len,
269                    count: count + 1,
270                });
271            }
272            if ascii_i != key_ascii {
273                // We are _not the first_ target of a branch node.
274                // Set the cursor to the previous string and continue the loop.
275                j = i;
276                i -= 1;
277                prefix_len = all_items.get_or_panic(i).0.len();
278                current_len = 0;
279                continue;
280            }
281            // Branch (first)
282            let (total_length, total_count) = {
283                let BranchMeta {
284                    cumulative_length,
285                    count,
286                    ..
287                } = lengths_stack.peek_or_panic();
288                (cumulative_length, count)
289            };
290            let branch_metas;
291            (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
292            let original_keys = branch_metas.map_to_ascii_bytes();
293            // Write out the offset table
294            current_len = total_length;
295            const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
296            let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
297            if w > 3 {
298                { ::core::panicking::panic_fmt(format_args!("ZeroTrie capacity exceeded")); };panic!("ZeroTrie capacity exceeded");
299            }
300            let mut k = 0;
301            while k <= w {
302                self = self.prepend_n_zeros(total_count - 1);
303                current_len += total_count - 1;
304                let mut l = 0;
305                let mut length_to_write = 0;
306                while l < total_count {
307                    let BranchMeta { local_length, .. } = *branch_metas
308                        .as_const_slice()
309                        .get_or_panic(total_count - l - 1);
310                    let mut adjusted_length = length_to_write;
311                    let mut m = 0;
312                    while m < k {
313                        adjusted_length >>= 8;
314                        m += 1;
315                    }
316                    if l > 0 {
317                        self = self.bitor_assign_at_or_panic(l - 1, adjusted_length as u8);
318                    }
319                    l += 1;
320                    length_to_write += local_length;
321                }
322                k += 1;
323            }
324            // Write out the lookup table
325            if !(0 < total_count && total_count <= 256) {
    ::core::panicking::panic("assertion failed: 0 < total_count && total_count <= 256")
};assert!(0 < total_count && total_count <= 256);
326            let branch_value = (w << 8) + (total_count & 0xff);
327            let slice_len;
328            (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
329            let branch_len;
330            (self, branch_len) = self.prepend_branch(branch_value);
331            current_len += slice_len + branch_len;
332            i = new_i;
333            j = new_j;
334        }
335        if !lengths_stack.is_empty() {
    ::core::panicking::panic("assertion failed: lengths_stack.is_empty()")
};assert!(lengths_stack.is_empty());
336        (self, current_len)
337    }
338}