1use crate::util::int::Usize;
23/// A representation of byte oriented equivalence classes.
4///
5/// This is used in finite state machines to reduce the size of the transition
6/// table. This can have a particularly large impact not only on the total size
7/// of an FSM, but also on FSM build times because it reduces the number of
8/// transitions that need to be visited/set.
9#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteClasses {
#[inline]
fn clone(&self) -> ByteClasses {
let _: ::core::clone::AssertParamIsClone<[u8; 256]>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ByteClasses { }Copy)]
10pub(crate) struct ByteClasses([u8; 256]);
1112impl ByteClasses {
13/// Creates a new set of equivalence classes where all bytes are mapped to
14 /// the same class.
15pub(crate) fn empty() -> ByteClasses {
16ByteClasses([0; 256])
17 }
1819/// Creates a new set of equivalence classes where each byte belongs to
20 /// its own equivalence class.
21pub(crate) fn singletons() -> ByteClasses {
22let mut classes = ByteClasses::empty();
23for b in 0..=255 {
24 classes.set(b, b);
25 }
26classes27 }
2829/// Set the equivalence class for the given byte.
30#[inline]
31pub(crate) fn set(&mut self, byte: u8, class: u8) {
32self.0[usize::from(byte)] = class;
33 }
3435/// Get the equivalence class for the given byte.
36#[inline]
37pub(crate) fn get(&self, byte: u8) -> u8 {
38self.0[usize::from(byte)]
39 }
4041/// Return the total number of elements in the alphabet represented by
42 /// these equivalence classes. Equivalently, this returns the total number
43 /// of equivalence classes.
44#[inline]
45pub(crate) fn alphabet_len(&self) -> usize {
46// Add one since the number of equivalence classes is one bigger than
47 // the last one.
48usize::from(self.0[255]) + 1
49}
5051/// Returns the stride, as a base-2 exponent, required for these
52 /// equivalence classes.
53 ///
54 /// The stride is always the smallest power of 2 that is greater than or
55 /// equal to the alphabet length. This is done so that converting between
56 /// state IDs and indices can be done with shifts alone, which is much
57 /// faster than integer division. The "stride2" is the exponent. i.e.,
58 /// `2^stride2 = stride`.
59pub(crate) fn stride2(&self) -> usize {
60let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
61usize::try_from(zeros).unwrap()
62 }
6364/// Returns the stride for these equivalence classes, which corresponds
65 /// to the smallest power of 2 greater than or equal to the number of
66 /// equivalence classes.
67pub(crate) fn stride(&self) -> usize {
681 << self.stride2()
69 }
7071/// Returns true if and only if every byte in this class maps to its own
72 /// equivalence class. Equivalently, there are 257 equivalence classes
73 /// and each class contains exactly one byte (plus the special EOI class).
74#[inline]
75pub(crate) fn is_singleton(&self) -> bool {
76self.alphabet_len() == 256
77}
7879/// Returns an iterator over all equivalence classes in this set.
80pub(crate) fn iter(&self) -> ByteClassIter {
81ByteClassIter { it: 0..self.alphabet_len() }
82 }
8384/// Returns an iterator of the bytes in the given equivalence class.
85pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
86ByteClassElements { classes: self, class, bytes: 0..=255 }
87 }
8889/// Returns an iterator of byte ranges in the given equivalence class.
90 ///
91 /// That is, a sequence of contiguous ranges are returned. Typically, every
92 /// class maps to a single contiguous range.
93fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
94ByteClassElementRanges { elements: self.elements(class), range: None }
95 }
96}
9798impl core::fmt::Debugfor ByteClasses {
99fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
100if self.is_singleton() {
101f.write_fmt(format_args!("ByteClasses(<one-class-per-byte>)"))write!(f, "ByteClasses(<one-class-per-byte>)")102 } else {
103f.write_fmt(format_args!("ByteClasses("))write!(f, "ByteClasses(")?;
104for (i, class) in self.iter().enumerate() {
105if i > 0 {
106f.write_fmt(format_args!(", "))write!(f, ", ")?;
107 }
108f.write_fmt(format_args!("{0:?} => [", class))write!(f, "{:?} => [", class)?;
109for (start, end) in self.element_ranges(class) {
110if start == end {
111f.write_fmt(format_args!("{0:?}", start))write!(f, "{:?}", start)?;
112 } else {
113f.write_fmt(format_args!("{0:?}-{1:?}", start, end))write!(f, "{:?}-{:?}", start, end)?;
114 }
115 }
116f.write_fmt(format_args!("]"))write!(f, "]")?;
117 }
118f.write_fmt(format_args!(")"))write!(f, ")")119 }
120 }
121}
122123/// An iterator over each equivalence class.
124#[derive(#[automatically_derived]
impl ::core::fmt::Debug for ByteClassIter {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "ByteClassIter",
"it", &&self.it)
}
}Debug)]
125pub(crate) struct ByteClassIter {
126 it: core::ops::Range<usize>,
127}
128129impl Iteratorfor ByteClassIter {
130type Item = u8;
131132fn next(&mut self) -> Option<u8> {
133self.it.next().map(|class| class.as_u8())
134 }
135}
136137/// An iterator over all elements in a specific equivalence class.
138#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassElements<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f,
"ByteClassElements", "classes", &self.classes, "class",
&self.class, "bytes", &&self.bytes)
}
}Debug)]
139pub(crate) struct ByteClassElements<'a> {
140 classes: &'a ByteClasses,
141 class: u8,
142 bytes: core::ops::RangeInclusive<u8>,
143}
144145impl<'a> Iteratorfor ByteClassElements<'a> {
146type Item = u8;
147148fn next(&mut self) -> Option<u8> {
149while let Some(byte) = self.bytes.next() {
150if self.class == self.classes.get(byte) {
151return Some(byte);
152 }
153 }
154None155 }
156}
157158/// An iterator over all elements in an equivalence class expressed as a
159/// sequence of contiguous ranges.
160#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassElementRanges<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ByteClassElementRanges", "elements", &self.elements, "range",
&&self.range)
}
}Debug)]
161pub(crate) struct ByteClassElementRanges<'a> {
162 elements: ByteClassElements<'a>,
163 range: Option<(u8, u8)>,
164}
165166impl<'a> Iteratorfor ByteClassElementRanges<'a> {
167type Item = (u8, u8);
168169fn next(&mut self) -> Option<(u8, u8)> {
170loop {
171let element = match self.elements.next() {
172None => return self.range.take(),
173Some(element) => element,
174 };
175match self.range.take() {
176None => {
177self.range = Some((element, element));
178 }
179Some((start, end)) => {
180if usize::from(end) + 1 != usize::from(element) {
181self.range = Some((element, element));
182return Some((start, end));
183 }
184self.range = Some((start, element));
185 }
186 }
187 }
188 }
189}
190191/// A partitioning of bytes into equivalence classes.
192///
193/// A byte class set keeps track of an *approximation* of equivalence classes
194/// of bytes during NFA construction. That is, every byte in an equivalence
195/// class cannot discriminate between a match and a non-match.
196///
197/// Note that this may not compute the minimal set of equivalence classes.
198/// Basically, any byte in a pattern given to the noncontiguous NFA builder
199/// will automatically be treated as its own equivalence class. All other
200/// bytes---any byte not in any pattern---will be treated as their own
201/// equivalence classes. In theory, all bytes not in any pattern should
202/// be part of a single equivalence class, but in practice, we only treat
203/// contiguous ranges of bytes as an equivalence class. So the number of
204/// classes computed may be bigger than necessary. This usually doesn't make
205/// much of a difference, and keeps the implementation simple.
206#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteClassSet {
#[inline]
fn clone(&self) -> ByteClassSet {
ByteClassSet(::core::clone::Clone::clone(&self.0))
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for ByteClassSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "ByteClassSet",
&&self.0)
}
}Debug)]
207pub(crate) struct ByteClassSet(ByteSet);
208209impl Defaultfor ByteClassSet {
210fn default() -> ByteClassSet {
211ByteClassSet::empty()
212 }
213}
214215impl ByteClassSet {
216/// Create a new set of byte classes where all bytes are part of the same
217 /// equivalence class.
218pub(crate) fn empty() -> Self {
219ByteClassSet(ByteSet::empty())
220 }
221222/// Indicate the the range of byte given (inclusive) can discriminate a
223 /// match between it and all other bytes outside of the range.
224pub(crate) fn set_range(&mut self, start: u8, end: u8) {
225if true {
if !(start <= end) {
::core::panicking::panic("assertion failed: start <= end")
};
};debug_assert!(start <= end);
226if start > 0 {
227self.0.add(start - 1);
228 }
229self.0.add(end);
230 }
231232/// Convert this boolean set to a map that maps all byte values to their
233 /// corresponding equivalence class. The last mapping indicates the largest
234 /// equivalence class identifier (which is never bigger than 255).
235pub(crate) fn byte_classes(&self) -> ByteClasses {
236let mut classes = ByteClasses::empty();
237let mut class = 0u8;
238let mut b = 0u8;
239loop {
240classes.set(b, class);
241if b == 255 {
242break;
243 }
244if self.0.contains(b) {
245class = class.checked_add(1).unwrap();
246 }
247b = b.checked_add(1).unwrap();
248 }
249classes250 }
251}
252253/// A simple set of bytes that is reasonably cheap to copy and allocation free.
254#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteSet {
#[inline]
fn clone(&self) -> ByteSet {
let _: ::core::clone::AssertParamIsClone<BitSet>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ByteSet { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for ByteSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "ByteSet",
"bits", &&self.bits)
}
}Debug, #[automatically_derived]
impl ::core::default::Default for ByteSet {
#[inline]
fn default() -> ByteSet {
ByteSet { bits: ::core::default::Default::default() }
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for ByteSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<BitSet>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ByteSet {
#[inline]
fn eq(&self, other: &ByteSet) -> bool { self.bits == other.bits }
}PartialEq)]
255pub(crate) struct ByteSet {
256 bits: BitSet,
257}
258259/// The representation of a byte set. Split out so that we can define a
260/// convenient Debug impl for it while keeping "ByteSet" in the output.
261#[derive(#[automatically_derived]
impl ::core::clone::Clone for BitSet {
#[inline]
fn clone(&self) -> BitSet {
let _: ::core::clone::AssertParamIsClone<[u128; 2]>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for BitSet { }Copy, #[automatically_derived]
impl ::core::default::Default for BitSet {
#[inline]
fn default() -> BitSet { BitSet(::core::default::Default::default()) }
}Default, #[automatically_derived]
impl ::core::cmp::Eq for BitSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<[u128; 2]>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for BitSet {
#[inline]
fn eq(&self, other: &BitSet) -> bool { self.0 == other.0 }
}PartialEq)]
262struct BitSet([u128; 2]);
263264impl ByteSet {
265/// Create an empty set of bytes.
266pub(crate) fn empty() -> ByteSet {
267ByteSet { bits: BitSet([0; 2]) }
268 }
269270/// Add a byte to this set.
271 ///
272 /// If the given byte already belongs to this set, then this is a no-op.
273pub(crate) fn add(&mut self, byte: u8) {
274let bucket = byte / 128;
275let bit = byte % 128;
276self.bits.0[usize::from(bucket)] |= 1 << bit;
277 }
278279/// Return true if and only if the given byte is in this set.
280pub(crate) fn contains(&self, byte: u8) -> bool {
281let bucket = byte / 128;
282let bit = byte % 128;
283self.bits.0[usize::from(bucket)] & (1 << bit) > 0
284}
285}
286287impl core::fmt::Debugfor BitSet {
288fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
289let mut fmtd = f.debug_set();
290for b in 0u8..=255 {
291if (ByteSet { bits: *self }).contains(b) {
292 fmtd.entry(&b);
293 }
294 }
295fmtd.finish()
296 }
297}
298299#[cfg(test)]
300mod tests {
301use alloc::{vec, vec::Vec};
302303use super::*;
304305#[test]
306fn byte_classes() {
307let mut set = ByteClassSet::empty();
308 set.set_range(b'a', b'z');
309310let classes = set.byte_classes();
311assert_eq!(classes.get(0), 0);
312assert_eq!(classes.get(1), 0);
313assert_eq!(classes.get(2), 0);
314assert_eq!(classes.get(b'a' - 1), 0);
315assert_eq!(classes.get(b'a'), 1);
316assert_eq!(classes.get(b'm'), 1);
317assert_eq!(classes.get(b'z'), 1);
318assert_eq!(classes.get(b'z' + 1), 2);
319assert_eq!(classes.get(254), 2);
320assert_eq!(classes.get(255), 2);
321322let mut set = ByteClassSet::empty();
323 set.set_range(0, 2);
324 set.set_range(4, 6);
325let classes = set.byte_classes();
326assert_eq!(classes.get(0), 0);
327assert_eq!(classes.get(1), 0);
328assert_eq!(classes.get(2), 0);
329assert_eq!(classes.get(3), 1);
330assert_eq!(classes.get(4), 2);
331assert_eq!(classes.get(5), 2);
332assert_eq!(classes.get(6), 2);
333assert_eq!(classes.get(7), 3);
334assert_eq!(classes.get(255), 3);
335 }
336337#[test]
338fn full_byte_classes() {
339let mut set = ByteClassSet::empty();
340for b in 0u8..=255 {
341 set.set_range(b, b);
342 }
343assert_eq!(set.byte_classes().alphabet_len(), 256);
344 }
345346#[test]
347fn elements_typical() {
348let mut set = ByteClassSet::empty();
349 set.set_range(b'b', b'd');
350 set.set_range(b'g', b'm');
351 set.set_range(b'z', b'z');
352let classes = set.byte_classes();
353// class 0: \x00-a
354 // class 1: b-d
355 // class 2: e-f
356 // class 3: g-m
357 // class 4: n-y
358 // class 5: z-z
359 // class 6: \x7B-\xFF
360assert_eq!(classes.alphabet_len(), 7);
361362let elements = classes.elements(0).collect::<Vec<_>>();
363assert_eq!(elements.len(), 98);
364assert_eq!(elements[0], b'\x00');
365assert_eq!(elements[97], b'a');
366367let elements = classes.elements(1).collect::<Vec<_>>();
368assert_eq!(elements, vec![b'b', b'c', b'd'],);
369370let elements = classes.elements(2).collect::<Vec<_>>();
371assert_eq!(elements, vec![b'e', b'f'],);
372373let elements = classes.elements(3).collect::<Vec<_>>();
374assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);
375376let elements = classes.elements(4).collect::<Vec<_>>();
377assert_eq!(elements.len(), 12);
378assert_eq!(elements[0], b'n');
379assert_eq!(elements[11], b'y');
380381let elements = classes.elements(5).collect::<Vec<_>>();
382assert_eq!(elements, vec![b'z']);
383384let elements = classes.elements(6).collect::<Vec<_>>();
385assert_eq!(elements.len(), 133);
386assert_eq!(elements[0], b'\x7B');
387assert_eq!(elements[132], b'\xFF');
388 }
389390#[test]
391fn elements_singletons() {
392let classes = ByteClasses::singletons();
393assert_eq!(classes.alphabet_len(), 256);
394395let elements = classes.elements(b'a').collect::<Vec<_>>();
396assert_eq!(elements, vec![b'a']);
397 }
398399#[test]
400fn elements_empty() {
401let classes = ByteClasses::empty();
402assert_eq!(classes.alphabet_len(), 1);
403404let elements = classes.elements(0).collect::<Vec<_>>();
405assert_eq!(elements.len(), 256);
406assert_eq!(elements[0], b'\x00');
407assert_eq!(elements[255], b'\xFF');
408 }
409}