1/*!
2This module provides APIs for dealing with the alphabets of finite state
3machines.
45There are two principal types in this module, [`ByteClasses`] and [`Unit`].
6The former defines the alphabet of a finite state machine while the latter
7represents an element of that alphabet.
89To a first approximation, the alphabet of all automata in this crate is just
10a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this
11can be quite wasteful when building a transition table for a DFA, since it
12requires storing a state identifier for each element in the alphabet. Instead,
13we collapse the alphabet of an automaton down into equivalence classes, where
14every byte in the same equivalence class never discriminates between a match or
15a non-match from any other byte in the same class. For example, in the regex
16`[a-z]+`, then you could consider it having an alphabet consisting of two
17equivalence classes: `a-z` and everything else. In terms of the transitions on
18an automaton, it doesn't actually require representing every distinct byte.
19Just the equivalence classes.
2021The downside of equivalence classes is that, of course, searching a haystack
22deals with individual byte values. Those byte values need to be mapped to
23their corresponding equivalence class. This is what `ByteClasses` does. In
24practice, doing this for every state transition has negligible impact on modern
25CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly
26considerably) shrinking the size of the transition table.
2728One last hiccup concerns `Unit`. Namely, because of look-around and how the
29DFAs in this crate work, we need to add a sentinel value to our alphabet
30of equivalence classes that represents the "end" of a search. We call that
31sentinel [`Unit::eoi`] or "end of input." Thus, a `Unit` is either an
32equivalence class corresponding to a set of bytes, or it is a special "end of
33input" sentinel.
3435In general, you should not expect to need either of these types unless you're
36doing lower level shenanigans with DFAs, or even building your own DFAs.
37(Although, you don't have to use these types to build your own DFAs of course.)
38For example, if you're walking a DFA's state graph, it's probably useful to
39make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead
40of just visiting every distinct `u8` value. The latter isn't necessarily wrong,
41but it could be potentially very wasteful.
42*/
43use crate::util::{
44escape::DebugByte,
45 wire::{self, DeserializeError, SerializeError},
46};
4748/// Unit represents a single unit of haystack for DFA based regex engines.
49///
50/// It is not expected for consumers of this crate to need to use this type
51/// unless they are implementing their own DFA. And even then, it's not
52/// required: implementors may use other techniques to handle haystack units.
53///
54/// Typically, a single unit of haystack for a DFA would be a single byte.
55/// However, for the DFAs in this crate, matches are delayed by a single byte
56/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
57/// we have consumed the haystack, we must run the DFA through one additional
58/// transition using a unit that indicates the haystack has ended.
59///
60/// There is no way to represent a sentinel with a `u8` since all possible
61/// values *may* be valid haystack units to a DFA, therefore this type
62/// explicitly adds room for a sentinel value.
63///
64/// The sentinel EOI value is always its own equivalence class and is
65/// ultimately represented by adding 1 to the maximum equivalence class value.
66/// So for example, the regex `^[a-z]+$` might be split into the following
67/// equivalence classes:
68///
69/// ```text
70/// 0 => [\x00-`]
71/// 1 => [a-z]
72/// 2 => [{-\xFF]
73/// 3 => [EOI]
74/// ```
75///
76/// Where EOI is the special sentinel value that is always in its own
77/// singleton equivalence class.
78#[derive(#[automatically_derived]
impl ::core::clone::Clone for Unit {
#[inline]
fn clone(&self) -> Unit {
let _: ::core::clone::AssertParamIsClone<UnitKind>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Unit { }Copy, #[automatically_derived]
impl ::core::cmp::Eq for Unit {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<UnitKind>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Unit {
#[inline]
fn eq(&self, other: &Unit) -> bool { self.0 == other.0 }
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for Unit {
#[inline]
fn partial_cmp(&self, other: &Unit)
-> ::core::option::Option<::core::cmp::Ordering> {
::core::cmp::PartialOrd::partial_cmp(&self.0, &other.0)
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for Unit {
#[inline]
fn cmp(&self, other: &Unit) -> ::core::cmp::Ordering {
::core::cmp::Ord::cmp(&self.0, &other.0)
}
}Ord)]
79pub struct Unit(UnitKind);
8081#[derive(#[automatically_derived]
impl ::core::clone::Clone for UnitKind {
#[inline]
fn clone(&self) -> UnitKind {
let _: ::core::clone::AssertParamIsClone<u8>;
let _: ::core::clone::AssertParamIsClone<u16>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for UnitKind { }Copy, #[automatically_derived]
impl ::core::cmp::Eq for UnitKind {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u8>;
let _: ::core::cmp::AssertParamIsEq<u16>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for UnitKind {
#[inline]
fn eq(&self, other: &UnitKind) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr &&
match (self, other) {
(UnitKind::U8(__self_0), UnitKind::U8(__arg1_0)) =>
__self_0 == __arg1_0,
(UnitKind::EOI(__self_0), UnitKind::EOI(__arg1_0)) =>
__self_0 == __arg1_0,
_ => unsafe { ::core::intrinsics::unreachable() }
}
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for UnitKind {
#[inline]
fn partial_cmp(&self, other: &UnitKind)
-> ::core::option::Option<::core::cmp::Ordering> {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
match (self, other) {
(UnitKind::U8(__self_0), UnitKind::U8(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
(UnitKind::EOI(__self_0), UnitKind::EOI(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
_ =>
::core::cmp::PartialOrd::partial_cmp(&__self_discr,
&__arg1_discr),
}
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for UnitKind {
#[inline]
fn cmp(&self, other: &UnitKind) -> ::core::cmp::Ordering {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
match ::core::cmp::Ord::cmp(&__self_discr, &__arg1_discr) {
::core::cmp::Ordering::Equal =>
match (self, other) {
(UnitKind::U8(__self_0), UnitKind::U8(__arg1_0)) =>
::core::cmp::Ord::cmp(__self_0, __arg1_0),
(UnitKind::EOI(__self_0), UnitKind::EOI(__arg1_0)) =>
::core::cmp::Ord::cmp(__self_0, __arg1_0),
_ => unsafe { ::core::intrinsics::unreachable() }
},
cmp => cmp,
}
}
}Ord)]
82enum UnitKind {
83/// Represents a byte value, or more typically, an equivalence class
84 /// represented as a byte value.
85U8(u8),
86/// Represents the "end of input" sentinel. We regrettably use a `u16`
87 /// here since the maximum sentinel value is `256`. Thankfully, we don't
88 /// actually store a `Unit` anywhere, so this extra space shouldn't be too
89 /// bad.
90EOI(u16),
91}
9293impl Unit {
94/// Create a new haystack unit from a byte value.
95 ///
96 /// All possible byte values are legal. However, when creating a haystack
97 /// unit for a specific DFA, one should be careful to only construct units
98 /// that are in that DFA's alphabet. Namely, one way to compact a DFA's
99 /// in-memory representation is to collapse its transitions to a set of
100 /// equivalence classes into a set of all possible byte values. If a DFA
101 /// uses equivalence classes instead of byte values, then the byte given
102 /// here should be the equivalence class.
103pub fn u8(byte: u8) -> Unit {
104Unit(UnitKind::U8(byte))
105 }
106107/// Create a new "end of input" haystack unit.
108 ///
109 /// The value given is the sentinel value used by this unit to represent
110 /// the "end of input." The value should be the total number of equivalence
111 /// classes in the corresponding alphabet. Its maximum value is `256`,
112 /// which occurs when every byte is its own equivalence class.
113 ///
114 /// # Panics
115 ///
116 /// This panics when `num_byte_equiv_classes` is greater than `256`.
117pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
118if !(num_byte_equiv_classes <= 256) {
{
::core::panicking::panic_fmt(format_args!("max number of byte-based equivalent classes is 256, but got {0}",
num_byte_equiv_classes));
}
};assert!(
119 num_byte_equiv_classes <= 256,
120"max number of byte-based equivalent classes is 256, but got \
121 {num_byte_equiv_classes}",
122 );
123Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap()))
124 }
125126/// If this unit is not an "end of input" sentinel, then returns its
127 /// underlying byte value. Otherwise return `None`.
128pub fn as_u8(self) -> Option<u8> {
129match self.0 {
130 UnitKind::U8(b) => Some(b),
131 UnitKind::EOI(_) => None,
132 }
133 }
134135/// If this unit is an "end of input" sentinel, then return the underlying
136 /// sentinel value that was given to [`Unit::eoi`]. Otherwise return
137 /// `None`.
138pub fn as_eoi(self) -> Option<u16> {
139match self.0 {
140 UnitKind::U8(_) => None,
141 UnitKind::EOI(sentinel) => Some(sentinel),
142 }
143 }
144145/// Return this unit as a `usize`, regardless of whether it is a byte value
146 /// or an "end of input" sentinel. In the latter case, the underlying
147 /// sentinel value given to [`Unit::eoi`] is returned.
148pub fn as_usize(self) -> usize {
149match self.0 {
150 UnitKind::U8(b) => usize::from(b),
151 UnitKind::EOI(eoi) => usize::from(eoi),
152 }
153 }
154155/// Returns true if and only of this unit is a byte value equivalent to the
156 /// byte given. This always returns false when this is an "end of input"
157 /// sentinel.
158pub fn is_byte(self, byte: u8) -> bool {
159self.as_u8().map_or(false, |b| b == byte)
160 }
161162/// Returns true when this unit represents an "end of input" sentinel.
163pub fn is_eoi(self) -> bool {
164self.as_eoi().is_some()
165 }
166167/// Returns true when this unit corresponds to an ASCII word byte.
168 ///
169 /// This always returns false when this unit represents an "end of input"
170 /// sentinel.
171pub fn is_word_byte(self) -> bool {
172self.as_u8().map_or(false, crate::util::utf8::is_word_byte)
173 }
174}
175176impl core::fmt::Debugfor Unit {
177fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
178match self.0 {
179 UnitKind::U8(b) => f.write_fmt(format_args!("{0:?}", DebugByte(b)))write!(f, "{:?}", DebugByte(b)),
180 UnitKind::EOI(_) => f.write_fmt(format_args!("EOI"))write!(f, "EOI"),
181 }
182 }
183}
184185/// A representation of byte oriented equivalence classes.
186///
187/// This is used in a DFA to reduce the size of the transition table. This can
188/// have a particularly large impact not only on the total size of a dense DFA,
189/// but also on compile times.
190///
191/// The essential idea here is that the alphabet of a DFA is shrunk from the
192/// usual 256 distinct byte values down to a set of equivalence classes. The
193/// guarantee you get is that any byte belonging to the same equivalence class
194/// can be treated as if it were any other byte in the same class, and the
195/// result of a search wouldn't change.
196///
197/// # Example
198///
199/// This example shows how to get byte classes from an
200/// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes.
201///
202/// ```
203/// use regex_automata::nfa::thompson::NFA;
204///
205/// let nfa = NFA::new("[a-z]+")?;
206/// let classes = nfa.byte_classes();
207/// // 'a' and 'z' are in the same class for this regex.
208/// assert_eq!(classes.get(b'a'), classes.get(b'z'));
209/// // But 'a' and 'A' are not.
210/// assert_ne!(classes.get(b'a'), classes.get(b'A'));
211///
212/// # Ok::<(), Box<dyn std::error::Error>>(())
213/// ```
214#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteClasses {
#[inline]
fn clone(&self) -> ByteClasses {
let _: ::core::clone::AssertParamIsClone<[u8; 256]>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ByteClasses { }Copy)]
215pub struct ByteClasses([u8; 256]);
216217impl ByteClasses {
218/// Creates a new set of equivalence classes where all bytes are mapped to
219 /// the same class.
220#[inline]
221pub fn empty() -> ByteClasses {
222ByteClasses([0; 256])
223 }
224225/// Creates a new set of equivalence classes where each byte belongs to
226 /// its own equivalence class.
227#[inline]
228pub fn singletons() -> ByteClasses {
229let mut classes = ByteClasses::empty();
230for b in 0..=255 {
231 classes.set(b, b);
232 }
233classes234 }
235236/// Deserializes a byte class map from the given slice. If the slice is of
237 /// insufficient length or otherwise contains an impossible mapping, then
238 /// an error is returned. Upon success, the number of bytes read along with
239 /// the map are returned. The number of bytes read is always a multiple of
240 /// 8.
241pub(crate) fn from_bytes(
242 slice: &[u8],
243 ) -> Result<(ByteClasses, usize), DeserializeError> {
244 wire::check_slice_len(slice, 256, "byte class map")?;
245let mut classes = ByteClasses::empty();
246for (b, &class) in slice[..256].iter().enumerate() {
247 classes.set(u8::try_from(b).unwrap(), class);
248 }
249// We specifically don't use 'classes.iter()' here because that
250 // iterator depends on 'classes.alphabet_len()' being correct. But that
251 // is precisely the thing we're trying to verify below!
252for &b in classes.0.iter() {
253if usize::from(b) >= classes.alphabet_len() {
254return Err(DeserializeError::generic(
255"found equivalence class greater than alphabet len",
256 ));
257 }
258 }
259Ok((classes, 256))
260 }
261262/// Writes this byte class map to the given byte buffer. if the given
263 /// buffer is too small, then an error is returned. Upon success, the total
264 /// number of bytes written is returned. The number of bytes written is
265 /// guaranteed to be a multiple of 8.
266pub(crate) fn write_to(
267&self,
268mut dst: &mut [u8],
269 ) -> Result<usize, SerializeError> {
270let nwrite = self.write_to_len();
271if dst.len() < nwrite {
272return Err(SerializeError::buffer_too_small("byte class map"));
273 }
274for b in 0..=255 {
275 dst[0] = self.get(b);
276 dst = &mut dst[1..];
277 }
278Ok(nwrite)
279 }
280281/// Returns the total number of bytes written by `write_to`.
282pub(crate) fn write_to_len(&self) -> usize {
283256
284}
285286/// Set the equivalence class for the given byte.
287#[inline]
288pub fn set(&mut self, byte: u8, class: u8) {
289self.0[usize::from(byte)] = class;
290 }
291292/// Get the equivalence class for the given byte.
293#[inline]
294pub fn get(&self, byte: u8) -> u8 {
295self.0[usize::from(byte)]
296 }
297298/// Get the equivalence class for the given haystack unit and return the
299 /// class as a `usize`.
300#[inline]
301pub fn get_by_unit(&self, unit: Unit) -> usize {
302match unit.0 {
303 UnitKind::U8(b) => usize::from(self.get(b)),
304 UnitKind::EOI(b) => usize::from(b),
305 }
306 }
307308/// Create a unit that represents the "end of input" sentinel based on the
309 /// number of equivalence classes.
310#[inline]
311pub fn eoi(&self) -> Unit {
312// The alphabet length already includes the EOI sentinel, hence why
313 // we subtract 1.
314Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
315 }
316317/// Return the total number of elements in the alphabet represented by
318 /// these equivalence classes. Equivalently, this returns the total number
319 /// of equivalence classes.
320#[inline]
321pub fn alphabet_len(&self) -> usize {
322// Add one since the number of equivalence classes is one bigger than
323 // the last one. But add another to account for the final EOI class
324 // that isn't explicitly represented.
325usize::from(self.0[255]) + 1 + 1
326}
327328/// Returns the stride, as a base-2 exponent, required for these
329 /// equivalence classes.
330 ///
331 /// The stride is always the smallest power of 2 that is greater than or
332 /// equal to the alphabet length, and the `stride2` returned here is the
333 /// exponent applied to `2` to get the smallest power. This is done so that
334 /// converting between premultiplied state IDs and indices can be done with
335 /// shifts alone, which is much faster than integer division.
336#[inline]
337pub fn stride2(&self) -> usize {
338let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
339usize::try_from(zeros).unwrap()
340 }
341342/// Returns true if and only if every byte in this class maps to its own
343 /// equivalence class. Equivalently, there are 257 equivalence classes
344 /// and each class contains either exactly one byte or corresponds to the
345 /// singleton class containing the "end of input" sentinel.
346#[inline]
347pub fn is_singleton(&self) -> bool {
348self.alphabet_len() == 257
349}
350351/// Returns an iterator over all equivalence classes in this set.
352#[inline]
353pub fn iter(&self) -> ByteClassIter<'_> {
354ByteClassIter { classes: self, i: 0 }
355 }
356357/// Returns an iterator over a sequence of representative bytes from each
358 /// equivalence class within the range of bytes given.
359 ///
360 /// When the given range is unbounded on both sides, the iterator yields
361 /// exactly N items, where N is equivalent to the number of equivalence
362 /// classes. Each item is an arbitrary byte drawn from each equivalence
363 /// class.
364 ///
365 /// This is useful when one is determinizing an NFA and the NFA's alphabet
366 /// hasn't been converted to equivalence classes. Picking an arbitrary byte
367 /// from each equivalence class then permits a full exploration of the NFA
368 /// instead of using every possible byte value and thus potentially saves
369 /// quite a lot of redundant work.
370 ///
371 /// # Example
372 ///
373 /// This shows an example of what a complete sequence of representatives
374 /// might look like from a real example.
375 ///
376 /// ```
377 /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
378 ///
379 /// let nfa = NFA::new("[a-z]+")?;
380 /// let classes = nfa.byte_classes();
381 /// let reps: Vec<Unit> = classes.representatives(..).collect();
382 /// // Note that the specific byte values yielded are not guaranteed!
383 /// let expected = vec![
384 /// Unit::u8(b'\x00'),
385 /// Unit::u8(b'a'),
386 /// Unit::u8(b'{'),
387 /// Unit::eoi(3),
388 /// ];
389 /// assert_eq!(expected, reps);
390 ///
391 /// # Ok::<(), Box<dyn std::error::Error>>(())
392 /// ```
393 ///
394 /// Note though, that you can ask for an arbitrary range of bytes, and only
395 /// representatives for that range will be returned:
396 ///
397 /// ```
398 /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
399 ///
400 /// let nfa = NFA::new("[a-z]+")?;
401 /// let classes = nfa.byte_classes();
402 /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
403 /// // Note that the specific byte values yielded are not guaranteed!
404 /// let expected = vec![
405 /// Unit::u8(b'A'),
406 /// Unit::u8(b'a'),
407 /// ];
408 /// assert_eq!(expected, reps);
409 ///
410 /// # Ok::<(), Box<dyn std::error::Error>>(())
411 /// ```
412pub fn representatives<R: core::ops::RangeBounds<u8>>(
413&self,
414 range: R,
415 ) -> ByteClassRepresentatives<'_> {
416use core::ops::Bound;
417418let cur_byte = match range.start_bound() {
419 Bound::Included(&i) => usize::from(i),
420 Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(),
421 Bound::Unbounded => 0,
422 };
423let end_byte = match range.end_bound() {
424 Bound::Included(&i) => {
425Some(usize::from(i).checked_add(1).unwrap())
426 }
427 Bound::Excluded(&i) => Some(usize::from(i)),
428 Bound::Unbounded => None,
429 };
430match (&(cur_byte), &(usize::MAX)) {
(left_val, right_val) => {
if *left_val == *right_val {
let kind = ::core::panicking::AssertKind::Ne;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("start range must be less than usize::MAX")));
}
}
};assert_ne!(
431 cur_byte,
432 usize::MAX,
433"start range must be less than usize::MAX",
434 );
435ByteClassRepresentatives {
436 classes: self,
437cur_byte,
438end_byte,
439 last_class: None,
440 }
441 }
442443/// Returns an iterator of the bytes in the given equivalence class.
444 ///
445 /// This is useful when one needs to know the actual bytes that belong to
446 /// an equivalence class. For example, conceptually speaking, accelerating
447 /// a DFA state occurs when a state only has a few outgoing transitions.
448 /// But in reality, what is required is that there are only a small
449 /// number of distinct bytes that can lead to an outgoing transition. The
450 /// difference is that any one transition can correspond to an equivalence
451 /// class which may contains many bytes. Therefore, DFA state acceleration
452 /// considers the actual elements in each equivalence class of each
453 /// outgoing transition.
454 ///
455 /// # Example
456 ///
457 /// This shows an example of how to get all of the elements in an
458 /// equivalence class.
459 ///
460 /// ```
461 /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
462 ///
463 /// let nfa = NFA::new("[a-z]+")?;
464 /// let classes = nfa.byte_classes();
465 /// let elements: Vec<Unit> = classes.elements(Unit::u8(1)).collect();
466 /// let expected: Vec<Unit> = (b'a'..=b'z').map(Unit::u8).collect();
467 /// assert_eq!(expected, elements);
468 ///
469 /// # Ok::<(), Box<dyn std::error::Error>>(())
470 /// ```
471#[inline]
472pub fn elements(&self, class: Unit) -> ByteClassElements<'_> {
473ByteClassElements { classes: self, class, byte: 0 }
474 }
475476/// Returns an iterator of byte ranges in the given equivalence class.
477 ///
478 /// That is, a sequence of contiguous ranges are returned. Typically, every
479 /// class maps to a single contiguous range.
480fn element_ranges(&self, class: Unit) -> ByteClassElementRanges<'_> {
481ByteClassElementRanges { elements: self.elements(class), range: None }
482 }
483}
484485impl Defaultfor ByteClasses {
486fn default() -> ByteClasses {
487ByteClasses::singletons()
488 }
489}
490491impl core::fmt::Debugfor ByteClasses {
492fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
493if self.is_singleton() {
494f.write_fmt(format_args!("ByteClasses({{singletons}})"))write!(f, "ByteClasses({{singletons}})")495 } else {
496f.write_fmt(format_args!("ByteClasses("))write!(f, "ByteClasses(")?;
497for (i, class) in self.iter().enumerate() {
498if i > 0 {
499f.write_fmt(format_args!(", "))write!(f, ", ")?;
500 }
501f.write_fmt(format_args!("{0:?} => [", class.as_usize()))write!(f, "{:?} => [", class.as_usize())?;
502for (start, end) in self.element_ranges(class) {
503if start == end {
504f.write_fmt(format_args!("{0:?}", start))write!(f, "{start:?}")?;
505 } else {
506f.write_fmt(format_args!("{0:?}-{1:?}", start, end))write!(f, "{start:?}-{end:?}")?;
507 }
508 }
509f.write_fmt(format_args!("]"))write!(f, "]")?;
510 }
511f.write_fmt(format_args!(")"))write!(f, ")")512 }
513 }
514}
515516/// An iterator over each equivalence class.
517///
518/// The last element in this iterator always corresponds to [`Unit::eoi`].
519///
520/// This is created by the [`ByteClasses::iter`] method.
521///
522/// The lifetime `'a` refers to the lifetime of the byte classes that this
523/// iterator was created from.
524#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassIter<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f, "ByteClassIter",
"classes", &self.classes, "i", &&self.i)
}
}Debug)]
525pub struct ByteClassIter<'a> {
526 classes: &'a ByteClasses,
527 i: usize,
528}
529530impl<'a> Iteratorfor ByteClassIter<'a> {
531type Item = Unit;
532533fn next(&mut self) -> Option<Unit> {
534if self.i + 1 == self.classes.alphabet_len() {
535self.i += 1;
536Some(self.classes.eoi())
537 } else if self.i < self.classes.alphabet_len() {
538let class = u8::try_from(self.i).unwrap();
539self.i += 1;
540Some(Unit::u8(class))
541 } else {
542None543 }
544 }
545}
546547/// An iterator over representative bytes from each equivalence class.
548///
549/// This is created by the [`ByteClasses::representatives`] method.
550///
551/// The lifetime `'a` refers to the lifetime of the byte classes that this
552/// iterator was created from.
553#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassRepresentatives<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field4_finish(f,
"ByteClassRepresentatives", "classes", &self.classes, "cur_byte",
&self.cur_byte, "end_byte", &self.end_byte, "last_class",
&&self.last_class)
}
}Debug)]
554pub struct ByteClassRepresentatives<'a> {
555 classes: &'a ByteClasses,
556 cur_byte: usize,
557 end_byte: Option<usize>,
558 last_class: Option<u8>,
559}
560561impl<'a> Iteratorfor ByteClassRepresentatives<'a> {
562type Item = Unit;
563564fn next(&mut self) -> Option<Unit> {
565while self.cur_byte < self.end_byte.unwrap_or(256) {
566let byte = u8::try_from(self.cur_byte).unwrap();
567let class = self.classes.get(byte);
568self.cur_byte += 1;
569570if self.last_class != Some(class) {
571self.last_class = Some(class);
572return Some(Unit::u8(byte));
573 }
574 }
575if self.cur_byte != usize::MAX && self.end_byte.is_none() {
576// Using usize::MAX as a sentinel is OK because we ban usize::MAX
577 // from appearing as a start bound in iterator construction. But
578 // why do it this way? Well, we want to return the EOI class
579 // whenever the end of the given range is unbounded because EOI
580 // isn't really a "byte" per se, so the only way it should be
581 // excluded is if there is a bounded end to the range. Therefore,
582 // when the end is unbounded, we just need to know whether we've
583 // reported EOI or not. When we do, we set cur_byte to a value it
584 // can never otherwise be.
585self.cur_byte = usize::MAX;
586return Some(self.classes.eoi());
587 }
588None589 }
590}
591592/// An iterator over all elements in an equivalence class.
593///
594/// This is created by the [`ByteClasses::elements`] method.
595///
596/// The lifetime `'a` refers to the lifetime of the byte classes that this
597/// iterator was created from.
598#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassElements<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f,
"ByteClassElements", "classes", &self.classes, "class",
&self.class, "byte", &&self.byte)
}
}Debug)]
599pub struct ByteClassElements<'a> {
600 classes: &'a ByteClasses,
601 class: Unit,
602 byte: usize,
603}
604605impl<'a> Iteratorfor ByteClassElements<'a> {
606type Item = Unit;
607608fn next(&mut self) -> Option<Unit> {
609while self.byte < 256 {
610let byte = u8::try_from(self.byte).unwrap();
611self.byte += 1;
612if self.class.is_byte(self.classes.get(byte)) {
613return Some(Unit::u8(byte));
614 }
615 }
616if self.byte < 257 {
617self.byte += 1;
618if self.class.is_eoi() {
619return Some(Unit::eoi(256));
620 }
621 }
622None623 }
624}
625626/// An iterator over all elements in an equivalence class expressed as a
627/// sequence of contiguous ranges.
628#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteClassElementRanges<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ByteClassElementRanges", "elements", &self.elements, "range",
&&self.range)
}
}Debug)]
629struct ByteClassElementRanges<'a> {
630 elements: ByteClassElements<'a>,
631 range: Option<(Unit, Unit)>,
632}
633634impl<'a> Iteratorfor ByteClassElementRanges<'a> {
635type Item = (Unit, Unit);
636637fn next(&mut self) -> Option<(Unit, Unit)> {
638loop {
639let element = match self.elements.next() {
640None => return self.range.take(),
641Some(element) => element,
642 };
643match self.range.take() {
644None => {
645self.range = Some((element, element));
646 }
647Some((start, end)) => {
648if end.as_usize() + 1 != element.as_usize()
649 || element.is_eoi()
650 {
651self.range = Some((element, element));
652return Some((start, end));
653 }
654self.range = Some((start, element));
655 }
656 }
657 }
658 }
659}
660661/// A partitioning of bytes into equivalence classes.
662///
663/// A byte class set keeps track of an *approximation* of equivalence classes
664/// of bytes during NFA construction. That is, every byte in an equivalence
665/// class cannot discriminate between a match and a non-match.
666///
667/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
668/// same equivalence class because it never matters whether an `a` or a `b` is
669/// seen, and no combination of `a`s and `b`s in the text can discriminate a
670/// match.
671///
672/// Note though that this does not compute the minimal set of equivalence
673/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
674/// same equivalence class for the same reason that `a` and `b` are in the
675/// same equivalence class in the aforementioned regex. However, in this
676/// implementation, `a` and `c` are put into distinct equivalence classes. The
677/// reason for this is implementation complexity. In the future, we should
678/// endeavor to compute the minimal equivalence classes since they can have a
679/// rather large impact on the size of the DFA. (Doing this will likely require
680/// rethinking how equivalence classes are computed, including changing the
681/// representation here, which is only able to group contiguous bytes into the
682/// same equivalence class.)
683#[cfg(feature = "alloc")]
684#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteClassSet {
#[inline]
fn clone(&self) -> ByteClassSet {
ByteClassSet(::core::clone::Clone::clone(&self.0))
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for ByteClassSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "ByteClassSet",
&&self.0)
}
}Debug)]
685pub(crate) struct ByteClassSet(ByteSet);
686687#[cfg(feature = "alloc")]
688impl Defaultfor ByteClassSet {
689fn default() -> ByteClassSet {
690ByteClassSet::empty()
691 }
692}
693694#[cfg(feature = "alloc")]
695impl ByteClassSet {
696/// Create a new set of byte classes where all bytes are part of the same
697 /// equivalence class.
698pub(crate) fn empty() -> Self {
699ByteClassSet(ByteSet::empty())
700 }
701702/// Indicate the range of byte given (inclusive) can discriminate a
703 /// match between it and all other bytes outside of the range.
704pub(crate) fn set_range(&mut self, start: u8, end: u8) {
705if true {
if !(start <= end) {
::core::panicking::panic("assertion failed: start <= end")
};
};debug_assert!(start <= end);
706if start > 0 {
707self.0.add(start - 1);
708 }
709self.0.add(end);
710 }
711712/// Add the contiguous ranges in the set given to this byte class set.
713pub(crate) fn add_set(&mut self, set: &ByteSet) {
714for (start, end) in set.iter_ranges() {
715self.set_range(start, end);
716 }
717 }
718719/// Convert this boolean set to a map that maps all byte values to their
720 /// corresponding equivalence class. The last mapping indicates the largest
721 /// equivalence class identifier (which is never bigger than 255).
722pub(crate) fn byte_classes(&self) -> ByteClasses {
723let mut classes = ByteClasses::empty();
724let mut class = 0u8;
725let mut b = 0u8;
726loop {
727classes.set(b, class);
728if b == 255 {
729break;
730 }
731if self.0.contains(b) {
732class = class.checked_add(1).unwrap();
733 }
734b = b.checked_add(1).unwrap();
735 }
736classes737 }
738}
739740/// A simple set of bytes that is reasonably cheap to copy and allocation free.
741#[derive(#[automatically_derived]
impl ::core::clone::Clone for ByteSet {
#[inline]
fn clone(&self) -> ByteSet {
let _: ::core::clone::AssertParamIsClone<BitSet>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ByteSet { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for ByteSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "ByteSet",
"bits", &&self.bits)
}
}Debug, #[automatically_derived]
impl ::core::default::Default for ByteSet {
#[inline]
fn default() -> ByteSet {
ByteSet { bits: ::core::default::Default::default() }
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for ByteSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<BitSet>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ByteSet {
#[inline]
fn eq(&self, other: &ByteSet) -> bool { self.bits == other.bits }
}PartialEq)]
742pub(crate) struct ByteSet {
743 bits: BitSet,
744}
745746/// The representation of a byte set. Split out so that we can define a
747/// convenient Debug impl for it while keeping "ByteSet" in the output.
748#[derive(#[automatically_derived]
impl ::core::clone::Clone for BitSet {
#[inline]
fn clone(&self) -> BitSet {
let _: ::core::clone::AssertParamIsClone<[u128; 2]>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for BitSet { }Copy, #[automatically_derived]
impl ::core::default::Default for BitSet {
#[inline]
fn default() -> BitSet { BitSet(::core::default::Default::default()) }
}Default, #[automatically_derived]
impl ::core::cmp::Eq for BitSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<[u128; 2]>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for BitSet {
#[inline]
fn eq(&self, other: &BitSet) -> bool { self.0 == other.0 }
}PartialEq)]
749struct BitSet([u128; 2]);
750751impl ByteSet {
752/// Create an empty set of bytes.
753pub(crate) fn empty() -> ByteSet {
754ByteSet { bits: BitSet([0; 2]) }
755 }
756757/// Add a byte to this set.
758 ///
759 /// If the given byte already belongs to this set, then this is a no-op.
760pub(crate) fn add(&mut self, byte: u8) {
761let bucket = byte / 128;
762let bit = byte % 128;
763self.bits.0[usize::from(bucket)] |= 1 << bit;
764 }
765766/// Remove a byte from this set.
767 ///
768 /// If the given byte is not in this set, then this is a no-op.
769pub(crate) fn remove(&mut self, byte: u8) {
770let bucket = byte / 128;
771let bit = byte % 128;
772self.bits.0[usize::from(bucket)] &= !(1 << bit);
773 }
774775/// Return true if and only if the given byte is in this set.
776pub(crate) fn contains(&self, byte: u8) -> bool {
777let bucket = byte / 128;
778let bit = byte % 128;
779self.bits.0[usize::from(bucket)] & (1 << bit) > 0
780}
781782/// Return true if and only if the given inclusive range of bytes is in
783 /// this set.
784pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool {
785 (start..=end).all(|b| self.contains(b))
786 }
787788/// Returns an iterator over all bytes in this set.
789pub(crate) fn iter(&self) -> ByteSetIter<'_> {
790ByteSetIter { set: self, b: 0 }
791 }
792793/// Returns an iterator over all contiguous ranges of bytes in this set.
794pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter<'_> {
795ByteSetRangeIter { set: self, b: 0 }
796 }
797798/// Return true if and only if this set is empty.
799#[cfg_attr(feature = "perf-inline", inline(always))]
800pub(crate) fn is_empty(&self) -> bool {
801self.bits.0 == [0, 0]
802 }
803804/// Deserializes a byte set from the given slice. If the slice is of
805 /// incorrect length or is otherwise malformed, then an error is returned.
806 /// Upon success, the number of bytes read along with the set are returned.
807 /// The number of bytes read is always a multiple of 8.
808pub(crate) fn from_bytes(
809 slice: &[u8],
810 ) -> Result<(ByteSet, usize), DeserializeError> {
811use core::mem::size_of;
812813 wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?;
814let mut nread = 0;
815let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?;
816nread += nr;
817let (high, nr) = wire::try_read_u128(slice, "byte set high bucket")?;
818nread += nr;
819Ok((ByteSet { bits: BitSet([low, high]) }, nread))
820 }
821822/// Writes this byte set to the given byte buffer. If the given buffer is
823 /// too small, then an error is returned. Upon success, the total number of
824 /// bytes written is returned. The number of bytes written is guaranteed to
825 /// be a multiple of 8.
826pub(crate) fn write_to<E: crate::util::wire::Endian>(
827&self,
828 dst: &mut [u8],
829 ) -> Result<usize, SerializeError> {
830use core::mem::size_of;
831832let nwrite = self.write_to_len();
833if dst.len() < nwrite {
834return Err(SerializeError::buffer_too_small("byte set"));
835 }
836let mut nw = 0;
837 E::write_u128(self.bits.0[0], &mut dst[nw..]);
838nw += size_of::<u128>();
839 E::write_u128(self.bits.0[1], &mut dst[nw..]);
840nw += size_of::<u128>();
841match (&nwrite, &nw) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("expected to write certain number of bytes")));
}
}
};assert_eq!(nwrite, nw, "expected to write certain number of bytes",);
842match (&(nw % 8), &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("expected to write multiple of 8 bytes for byte set")));
}
}
};assert_eq!(
843 nw % 8,
8440,
845"expected to write multiple of 8 bytes for byte set",
846 );
847Ok(nw)
848 }
849850/// Returns the total number of bytes written by `write_to`.
851pub(crate) fn write_to_len(&self) -> usize {
8522 * core::mem::size_of::<u128>()
853 }
854}
855856impl core::fmt::Debugfor BitSet {
857fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
858let mut fmtd = f.debug_set();
859for b in 0u8..=255 {
860if (ByteSet { bits: *self }).contains(b) {
861 fmtd.entry(&b);
862 }
863 }
864fmtd.finish()
865 }
866}
867868#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteSetIter<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f, "ByteSetIter",
"set", &self.set, "b", &&self.b)
}
}Debug)]
869pub(crate) struct ByteSetIter<'a> {
870 set: &'a ByteSet,
871 b: usize,
872}
873874impl<'a> Iteratorfor ByteSetIter<'a> {
875type Item = u8;
876877fn next(&mut self) -> Option<u8> {
878while self.b <= 255 {
879let b = u8::try_from(self.b).unwrap();
880self.b += 1;
881if self.set.contains(b) {
882return Some(b);
883 }
884 }
885None886 }
887}
888889#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ByteSetRangeIter<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f,
"ByteSetRangeIter", "set", &self.set, "b", &&self.b)
}
}Debug)]
890pub(crate) struct ByteSetRangeIter<'a> {
891 set: &'a ByteSet,
892 b: usize,
893}
894895impl<'a> Iteratorfor ByteSetRangeIter<'a> {
896type Item = (u8, u8);
897898fn next(&mut self) -> Option<(u8, u8)> {
899let asu8 = |n: usize| u8::try_from(n).unwrap();
900while self.b <= 255 {
901let start = asu8(self.b);
902self.b += 1;
903if !self.set.contains(start) {
904continue;
905 }
906907let mut end = start;
908while self.b <= 255 && self.set.contains(asu8(self.b)) {
909 end = asu8(self.b);
910self.b += 1;
911 }
912return Some((start, end));
913 }
914None915 }
916}
917918#[cfg(all(test, feature = "alloc"))]
919mod tests {
920use alloc::{vec, vec::Vec};
921922use super::*;
923924#[test]
925fn byte_classes() {
926let mut set = ByteClassSet::empty();
927 set.set_range(b'a', b'z');
928929let classes = set.byte_classes();
930assert_eq!(classes.get(0), 0);
931assert_eq!(classes.get(1), 0);
932assert_eq!(classes.get(2), 0);
933assert_eq!(classes.get(b'a' - 1), 0);
934assert_eq!(classes.get(b'a'), 1);
935assert_eq!(classes.get(b'm'), 1);
936assert_eq!(classes.get(b'z'), 1);
937assert_eq!(classes.get(b'z' + 1), 2);
938assert_eq!(classes.get(254), 2);
939assert_eq!(classes.get(255), 2);
940941let mut set = ByteClassSet::empty();
942 set.set_range(0, 2);
943 set.set_range(4, 6);
944let classes = set.byte_classes();
945assert_eq!(classes.get(0), 0);
946assert_eq!(classes.get(1), 0);
947assert_eq!(classes.get(2), 0);
948assert_eq!(classes.get(3), 1);
949assert_eq!(classes.get(4), 2);
950assert_eq!(classes.get(5), 2);
951assert_eq!(classes.get(6), 2);
952assert_eq!(classes.get(7), 3);
953assert_eq!(classes.get(255), 3);
954 }
955956#[test]
957fn full_byte_classes() {
958let mut set = ByteClassSet::empty();
959for b in 0u8..=255 {
960 set.set_range(b, b);
961 }
962assert_eq!(set.byte_classes().alphabet_len(), 257);
963 }
964965#[test]
966fn elements_typical() {
967let mut set = ByteClassSet::empty();
968 set.set_range(b'b', b'd');
969 set.set_range(b'g', b'm');
970 set.set_range(b'z', b'z');
971let classes = set.byte_classes();
972// class 0: \x00-a
973 // class 1: b-d
974 // class 2: e-f
975 // class 3: g-m
976 // class 4: n-y
977 // class 5: z-z
978 // class 6: \x7B-\xFF
979 // class 7: EOI
980assert_eq!(classes.alphabet_len(), 8);
981982let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
983assert_eq!(elements.len(), 98);
984assert_eq!(elements[0], Unit::u8(b'\x00'));
985assert_eq!(elements[97], Unit::u8(b'a'));
986987let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
988assert_eq!(
989 elements,
990vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
991 );
992993let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
994assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);
995996let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
997assert_eq!(
998 elements,
999vec![
1000 Unit::u8(b'g'),
1001 Unit::u8(b'h'),
1002 Unit::u8(b'i'),
1003 Unit::u8(b'j'),
1004 Unit::u8(b'k'),
1005 Unit::u8(b'l'),
1006 Unit::u8(b'm'),
1007 ],
1008 );
10091010let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
1011assert_eq!(elements.len(), 12);
1012assert_eq!(elements[0], Unit::u8(b'n'));
1013assert_eq!(elements[11], Unit::u8(b'y'));
10141015let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
1016assert_eq!(elements, vec![Unit::u8(b'z')]);
10171018let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
1019assert_eq!(elements.len(), 133);
1020assert_eq!(elements[0], Unit::u8(b'\x7B'));
1021assert_eq!(elements[132], Unit::u8(b'\xFF'));
10221023let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
1024assert_eq!(elements, vec![Unit::eoi(256)]);
1025 }
10261027#[test]
1028fn elements_singletons() {
1029let classes = ByteClasses::singletons();
1030assert_eq!(classes.alphabet_len(), 257);
10311032let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
1033assert_eq!(elements, vec![Unit::u8(b'a')]);
10341035let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
1036assert_eq!(elements, vec![Unit::eoi(256)]);
1037 }
10381039#[test]
1040fn elements_empty() {
1041let classes = ByteClasses::empty();
1042assert_eq!(classes.alphabet_len(), 2);
10431044let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
1045assert_eq!(elements.len(), 256);
1046assert_eq!(elements[0], Unit::u8(b'\x00'));
1047assert_eq!(elements[255], Unit::u8(b'\xFF'));
10481049let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
1050assert_eq!(elements, vec![Unit::eoi(256)]);
1051 }
10521053#[test]
1054fn representatives() {
1055let mut set = ByteClassSet::empty();
1056 set.set_range(b'b', b'd');
1057 set.set_range(b'g', b'm');
1058 set.set_range(b'z', b'z');
1059let classes = set.byte_classes();
10601061let got: Vec<Unit> = classes.representatives(..).collect();
1062let expected = vec![
1063 Unit::u8(b'\x00'),
1064 Unit::u8(b'b'),
1065 Unit::u8(b'e'),
1066 Unit::u8(b'g'),
1067 Unit::u8(b'n'),
1068 Unit::u8(b'z'),
1069 Unit::u8(b'\x7B'),
1070 Unit::eoi(7),
1071 ];
1072assert_eq!(expected, got);
10731074let got: Vec<Unit> = classes.representatives(..0).collect();
1075assert!(got.is_empty());
1076let got: Vec<Unit> = classes.representatives(1..1).collect();
1077assert!(got.is_empty());
1078let got: Vec<Unit> = classes.representatives(255..255).collect();
1079assert!(got.is_empty());
10801081// A weird case that is the only guaranteed to way to get an iterator
1082 // of just the EOI class by excluding all possible byte values.
1083let got: Vec<Unit> = classes
1084 .representatives((
1085 core::ops::Bound::Excluded(255),
1086 core::ops::Bound::Unbounded,
1087 ))
1088 .collect();
1089let expected = vec![Unit::eoi(7)];
1090assert_eq!(expected, got);
10911092let got: Vec<Unit> = classes.representatives(..=255).collect();
1093let expected = vec![
1094 Unit::u8(b'\x00'),
1095 Unit::u8(b'b'),
1096 Unit::u8(b'e'),
1097 Unit::u8(b'g'),
1098 Unit::u8(b'n'),
1099 Unit::u8(b'z'),
1100 Unit::u8(b'\x7B'),
1101 ];
1102assert_eq!(expected, got);
11031104let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect();
1105let expected = vec![Unit::u8(b'b')];
1106assert_eq!(expected, got);
11071108let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect();
1109let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')];
1110assert_eq!(expected, got);
11111112let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect();
1113let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')];
1114assert_eq!(expected, got);
11151116let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect();
1117let expected = vec![Unit::u8(b'A')];
1118assert_eq!(expected, got);
11191120let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
1121let expected = vec![
1122 Unit::u8(b'A'),
1123 Unit::u8(b'b'),
1124 Unit::u8(b'e'),
1125 Unit::u8(b'g'),
1126 Unit::u8(b'n'),
1127 Unit::u8(b'z'),
1128 ];
1129assert_eq!(expected, got);
11301131let got: Vec<Unit> = classes.representatives(b'z'..).collect();
1132let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)];
1133assert_eq!(expected, got);
11341135let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect();
1136let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')];
1137assert_eq!(expected, got);
1138 }
1139}