Skip to main content

zerotrie/builder/konst/
builder.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::super::branch_meta::BranchMeta;
6use super::super::slice_indices::{self, ByteSliceWithIndices};
7use super::store::ConstArrayBuilder;
8use super::store::ConstLengthsStack;
9use super::store::ConstSlice;
10use crate::varint;
11
12/// A low-level builder for [`ZeroTrieSimpleAscii`]. Works in const contexts.
13///
14/// All methods that grow the trie will panic if the capacity N is not enough.
15pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
16    data: ConstArrayBuilder<N, u8>,
17}
18
19impl<const N: usize> ZeroTrieBuilderConst<N> {
20    /// Non-const function that returns the current trie data as a slice.
21    #[cfg(feature = "litemap")]
22    pub fn as_bytes(&self) -> &[u8] {
23        self.data.as_const_slice().as_slice()
24    }
25
26    /// Returns the trie data, panicking if the buffer is the wrong size.
27    pub const fn build_or_panic(self) -> [u8; N] {
28        self.data.const_build_or_panic()
29    }
30
31    /// Creates a new empty builder.
32    pub const fn new() -> Self {
33        Self {
34            data: ConstArrayBuilder::new_empty([0; N], N),
35        }
36    }
37
38    /// Prepends an ASCII node to the front of the builder. Returns the new builder
39    /// and the delta in length, which is always 1.
40    pub const fn prepend_ascii(&mut self, ascii: u8) -> usize {
41        if ascii >= 128 {
42            {
    ::core::panicking::panic_fmt(format_args!("Non-ASCII not supported in ZeroTrieSimpleAscii"));
};panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
43        }
44        self.data.const_push_front_or_panic(ascii);
45        1
46    }
47
48    /// Prepends a value node to the front of the builder. Returns the new builder
49    /// and the delta in length, which depends on the size of the varint.
50    pub const fn prepend_value(&mut self, value: usize) -> usize {
51        let varint_array = varint::write_varint_meta3(value);
52        // Can panic (as documented in class docs):
53        self.data
54            .const_extend_front_or_panic(varint_array.as_const_slice());
55        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
56        self.data.const_bitor_assign_or_panic(0, 0b10000000);
57        varint_array.len()
58    }
59
60    /// Prepends a branch node to the front of the builder. Returns the new builder
61    /// and the delta in length, which depends on the size of the varint.
62    pub const fn prepend_branch(&mut self, value: usize) -> usize {
63        let varint_array = varint::write_varint_meta2(value);
64        // Can panic (as documented in type-level docs):
65        self.data
66            .const_extend_front_or_panic(varint_array.as_const_slice());
67        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
68        self.data.const_bitor_assign_or_panic(0, 0b11000000);
69        varint_array.len()
70    }
71
72    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
73    /// and the delta in length, which is the length of the slice.
74    pub const fn prepend_slice(&mut self, s: ConstSlice<u8>) -> usize {
75        let mut i = s.len();
76        while i > 0 {
77            // Can panic (as documented in type-level docs):
78            self.data.const_push_front_or_panic(*s.get_or_panic(i - 1));
79            i -= 1;
80        }
81        s.len()
82    }
83
84    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
85    pub const fn prepend_n_zeros(&mut self, n: usize) {
86        let mut i = 0;
87        while i < n {
88            // Can panic (as documented in type-level docs):
89            self.data.const_push_front_or_panic(0);
90            i += 1;
91        }
92    }
93
94    /// Performs the operation `self[index] |= bits`
95    pub const fn bitor_assign_at_or_panic(&mut self, index: usize, bits: u8) {
96        self.data.const_bitor_assign_or_panic(index, bits);
97    }
98
99    /// Creates a new builder containing the elements in the given slice of key/value pairs.
100    ///
101    /// `K` is the stack size of the lengths stack. If you get an error such as
102    /// `AsciiTrie Builder: Need more stack`, try increasing `K`.
103    ///
104    /// # Panics
105    ///
106    /// Panics if the items are not sorted
107    pub const fn from_tuple_slice<'a, const K: usize>(items: ByteSliceWithIndices<'a>) -> Self {
108        if !items.is_sorted() {
    {
        ::core::panicking::panic_fmt(format_args!("Strings in ZeroTrie constructor are not sorted"));
    }
};assert!(
109            items.is_sorted(),
110            "Strings in ZeroTrie constructor are not sorted"
111        );
112        Self::from_slice_with_indices::<K>(items)
113    }
114
115    /// Creates a new builder containing the elements in the given slice of key/value pairs.
116    ///
117    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
118    ///
119    /// `K` is the stack size of the lengths stack. If you get an error such as
120    /// `AsciiTrie Builder: Need more stack`, try increasing `K`.
121    pub const fn from_slice_with_indices<const K: usize>(items: ByteSliceWithIndices) -> Self {
122        let mut result = Self::new();
123        let total_size = result.create_or_panic::<K>(items);
124        if true {
    if !(total_size == result.data.len()) {
        ::core::panicking::panic("assertion failed: total_size == result.data.len()")
    };
};debug_assert!(total_size == result.data.len());
125        result
126    }
127
128    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
129    #[must_use]
130    #[expect(
131        clippy::indexing_slicing,
132        reason = "all call sites are well-commented and will not panic"
133    )]
134    const fn create_or_panic<const K: usize>(&mut self, all_items: ByteSliceWithIndices) -> usize {
135        let mut prefix_len = match all_items.last() {
136            Some(x) => x.0.len(),
137            // Empty slice:
138            None => return 0,
139        };
140        // Initialize the main loop to point at the last string.
141        let mut lengths_stack = ConstLengthsStack::<K>::new();
142        let mut i = all_items.len() - 1;
143        let mut j = all_items.len();
144        let mut current_len = 0;
145        // Start the main loop.
146        loop {
147            let item_i = all_items.get_or_panic(i);
148            let item_j = all_items.get_or_panic(j - 1);
149            if true {
    if !slice_indices::prefix_eq_or_panic(item_i.0, item_j.0, prefix_len) {
        ::core::panicking::panic("assertion failed: slice_indices::prefix_eq_or_panic(item_i.0, item_j.0, prefix_len)")
    };
};debug_assert!(slice_indices::prefix_eq_or_panic(
150                item_i.0, item_j.0, prefix_len
151            ));
152            // Check if we need to add a value node here.
153            if item_i.0.len() == prefix_len {
154                current_len += self.prepend_value(item_i.1);
155            }
156            if prefix_len == 0 {
157                // All done! Leave the main loop.
158                break;
159            }
160            // Reduce the prefix length by 1 and recalculate i and j.
161            prefix_len -= 1;
162            let mut new_i = i;
163            let mut new_j = j;
164            // `prefix_len` is guaranteed to be in bounds for `item_i` since it is
165            // instantiated and updated based on the length of `item_i`, and then only decremented.
166            // This is a bit harder to prove for `item_j`, but it is a loop invariant
167            // TODO(sffc): Consider proving this or perhaps using a checked get.
168            let mut ascii_i = item_i.0[prefix_len];
169            let mut ascii_j = item_j.0[prefix_len];
170            if true {
    if !(ascii_i == ascii_j) {
        ::core::panicking::panic("assertion failed: ascii_i == ascii_j")
    };
};debug_assert!(ascii_i == ascii_j);
171            let key_ascii = ascii_i;
172            loop {
173                if new_i == 0 {
174                    break;
175                }
176                let candidate = all_items.get_or_panic(new_i - 1).0;
177                if candidate.len() < prefix_len {
178                    // Too short
179                    break;
180                }
181                if slice_indices::prefix_eq_or_panic(item_i.0, candidate, prefix_len) {
182                    new_i -= 1;
183                } else {
184                    break;
185                }
186
187                if candidate.len() == prefix_len {
188                    // A string that equals the prefix does not take part in the branch node.
189                    break;
190                }
191                let candidate = candidate[prefix_len];
192                if candidate != ascii_i {
193                    ascii_i = candidate;
194                }
195            }
196            loop {
197                if new_j == all_items.len() {
198                    break;
199                }
200                let candidate = all_items.get_or_panic(new_j).0;
201                if candidate.len() < prefix_len {
202                    // Too short
203                    break;
204                }
205                if slice_indices::prefix_eq_or_panic(item_j.0, candidate, prefix_len) {
206                    new_j += 1;
207                } else {
208                    break;
209                }
210                if candidate.len() == prefix_len {
211                    {
    ::core::panicking::panic_fmt(format_args!("A shorter string should be earlier in the sequence"));
};panic!("A shorter string should be earlier in the sequence");
212                }
213                let candidate = candidate[prefix_len];
214                if candidate != ascii_j {
215                    ascii_j = candidate;
216                }
217            }
218            // If there are no different bytes at this prefix level, we can add an ASCII or Span
219            // node and then continue to the next iteration of the main loop.
220            if ascii_i == key_ascii && ascii_j == key_ascii {
221                current_len += self.prepend_ascii(ascii_i);
222                if true {
    if !(i == new_i || i == new_i + 1) {
        ::core::panicking::panic("assertion failed: i == new_i || i == new_i + 1")
    };
};debug_assert!(i == new_i || i == new_i + 1);
223                i = new_i;
224                if true {
    if !(j == new_j) {
        ::core::panicking::panic("assertion failed: j == new_j")
    };
};debug_assert!(j == new_j);
225                continue;
226            }
227            // If i and j changed, we are a target of a branch node.
228            if ascii_j == key_ascii {
229                // We are the _last_ target of a branch node.
230                lengths_stack.push_or_panic(BranchMeta {
231                    ascii: key_ascii,
232                    cumulative_length: current_len,
233                    local_length: current_len,
234                    count: 1,
235                });
236            } else {
237                // We are the _not the last_ target of a branch node.
238                let BranchMeta {
239                    cumulative_length,
240                    count,
241                    ..
242                } = lengths_stack.peek_or_panic();
243                lengths_stack.push_or_panic(BranchMeta {
244                    ascii: key_ascii,
245                    cumulative_length: cumulative_length + current_len,
246                    local_length: current_len,
247                    count: count + 1,
248                });
249            }
250            if ascii_i != key_ascii {
251                // We are _not the first_ target of a branch node.
252                // Set the cursor to the previous string and continue the loop.
253                j = i;
254                i -= 1;
255                prefix_len = all_items.get_or_panic(i).0.len();
256                current_len = 0;
257                continue;
258            }
259            // Branch (first)
260            let (total_length, total_count) = {
261                let BranchMeta {
262                    cumulative_length,
263                    count,
264                    ..
265                } = lengths_stack.peek_or_panic();
266                (cumulative_length, count)
267            };
268            let branch_metas = lengths_stack.pop_many_or_panic(total_count);
269            let original_keys = branch_metas.map_to_ascii_bytes();
270            // Write out the offset table
271            current_len = total_length;
272            let w = (usize::BITS as usize - (total_length.leading_zeros() as usize) - 1) / 8;
273            if w > 3 {
274                { ::core::panicking::panic_fmt(format_args!("ZeroTrie capacity exceeded")); };panic!("ZeroTrie capacity exceeded");
275            }
276            let mut k = 0;
277            while k <= w {
278                self.prepend_n_zeros(total_count - 1);
279                current_len += total_count - 1;
280                let mut l = 0;
281                let mut length_to_write = 0;
282                while l < total_count {
283                    let BranchMeta { local_length, .. } = *branch_metas
284                        .as_const_slice()
285                        .get_or_panic(total_count - l - 1);
286                    let mut adjusted_length = length_to_write;
287                    let mut m = 0;
288                    while m < k {
289                        adjusted_length >>= 8;
290                        m += 1;
291                    }
292                    if l > 0 {
293                        self.bitor_assign_at_or_panic(l - 1, adjusted_length as u8);
294                    }
295                    l += 1;
296                    length_to_write += local_length;
297                }
298                k += 1;
299            }
300            // Write out the lookup table
301            if !(0 < total_count && total_count <= 256) {
    ::core::panicking::panic("assertion failed: 0 < total_count && total_count <= 256")
};assert!(0 < total_count && total_count <= 256);
302            let branch_value = (w << 8) + (total_count & 0xff);
303            current_len += self.prepend_slice(original_keys.as_const_slice());
304            current_len += self.prepend_branch(branch_value);
305            i = new_i;
306            j = new_j;
307        }
308        if !lengths_stack.is_empty() {
    ::core::panicking::panic("assertion failed: lengths_stack.is_empty()")
};assert!(lengths_stack.is_empty());
309        current_len
310    }
311}