1use crate::{
2dfa::DEAD,
3 util::{
4primitives::StateID,
5 wire::{self, DeserializeError, Endian, SerializeError},
6 },
7};
89macro_rules! err {
10 ($msg:expr) => {
11return Err(DeserializeError::generic($msg));
12 };
13}
1415// Special represents the identifiers in a DFA that correspond to "special"
16// states. If a state is one or more of the following, then it is considered
17// special:
18//
19// * dead - A non-matching state where all outgoing transitions lead back to
20// itself. There is only one of these, regardless of whether minimization
21// has run. The dead state always has an ID of 0. i.e., It is always the
22// first state in a DFA.
23// * quit - A state that is entered whenever a byte is seen that should cause
24// a DFA to give up and stop searching. This results in a MatchError::quit
25// error being returned at search time. The default configuration for a DFA
26// has no quit bytes, which means this state is unreachable by default,
27// although it is always present for reasons of implementation simplicity.
28// This state is only reachable when the caller configures the DFA to quit
29// on certain bytes. There is always exactly one of these states and it
30// is always the second state. (Its actual ID depends on the size of the
31// alphabet in dense DFAs, since state IDs are premultiplied in order to
32// allow them to be used directly as indices into the transition table.)
33// * match - An accepting state, i.e., indicative of a match. There may be
34// zero or more of these states.
35// * accelerated - A state where all of its outgoing transitions, except a
36// few, loop back to itself. These states are candidates for acceleration
37// via memchr during search. There may be zero or more of these states.
38// * start - A non-matching state that indicates where the automaton should
39// start during a search. There is always at least one starting state and
40// all are guaranteed to be non-match states. (A start state cannot be a
41// match state because the DFAs in this crate delay all matches by one byte.
42// So every search that finds a match must move through one transition to
43// some other match state, even when searching an empty string.)
44//
45// These are not mutually exclusive categories. Namely, the following
46// overlapping can occur:
47//
48// * {dead, start} - If a DFA can never lead to a match and it is minimized,
49// then it will typically compile to something where all starting IDs point
50// to the DFA's dead state.
51// * {match, accelerated} - It is possible for a match state to have the
52// majority of its transitions loop back to itself, which means it's
53// possible for a match state to be accelerated.
54// * {start, accelerated} - Similarly, it is possible for a start state to be
55// accelerated. Note that it is possible for an accelerated state to be
56// neither a match or a start state. Also note that just because both match
57// and start states overlap with accelerated states does not mean that
58// match and start states overlap with each other. In fact, they are
59// guaranteed not to overlap.
60//
61// As a special mention, every DFA always has a dead and a quit state, even
62// though from the perspective of the DFA, they are equivalent. (Indeed,
63// minimization special cases them to ensure they don't get merged.) The
64// purpose of keeping them distinct is to use the quit state as a sentinel to
65// distinguish between whether a search finished successfully without finding
66// anything or whether it gave up before finishing.
67//
68// So the main problem we want to solve here is the *fast* detection of whether
69// a state is special or not. And we also want to do this while storing as
70// little extra data as possible. AND we want to be able to quickly determine
71// which categories a state falls into above if it is special.
72//
73// We achieve this by essentially shuffling all special states to the beginning
74// of a DFA. That is, all special states appear before every other non-special
75// state. By representing special states this way, we can determine whether a
76// state is special or not by a single comparison, where special.max is the
77// identifier of the last special state in the DFA:
78//
79// if current_state <= special.max:
80// ... do something with special state
81//
82// The only thing left to do is to determine what kind of special state
83// it is. Because what we do next depends on that. Since special states
84// are typically rare, we can afford to do a bit more extra work, but we'd
85// still like this to be as fast as possible. The trick we employ here is to
86// continue shuffling states even within the special state range. Such that
87// one contiguous region corresponds to match states, another for start states
88// and then an overlapping range for accelerated states. At a high level, our
89// special state detection might look like this (for leftmost searching, where
90// we continue searching even after seeing a match):
91//
92// byte = input[offset]
93// current_state = next_state(current_state, byte)
94// offset += 1
95// if current_state <= special.max:
96// if current_state == 0:
97// # We can never leave a dead state, so this always marks the
98// # end of our search.
99// return last_match
100// if current_state == special.quit_id:
101// # A quit state means we give up. If he DFA has no quit state,
102// # then special.quit_id == 0 == dead, which is handled by the
103// # conditional above.
104// return Err(MatchError::quit { byte, offset: offset - 1 })
105// if special.min_match <= current_state <= special.max_match:
106// last_match = Some(offset)
107// if special.min_accel <= current_state <= special.max_accel:
108// offset = accelerate(input, offset)
109// last_match = Some(offset)
110// elif special.min_start <= current_state <= special.max_start:
111// offset = prefilter.find(input, offset)
112// if special.min_accel <= current_state <= special.max_accel:
113// offset = accelerate(input, offset)
114// elif special.min_accel <= current_state <= special.max_accel:
115// offset = accelerate(input, offset)
116//
117// There are some small details left out of the logic above. For example,
118// in order to accelerate a state, we need to know which bytes to search for.
119// This in turn implies some extra data we need to store in the DFA. To keep
120// things compact, we would ideally only store
121//
122// N = special.max_accel - special.min_accel + 1
123//
124// items. But state IDs are premultiplied, which means they are not contiguous.
125// So in order to take a state ID and index an array of accelerated structures,
126// we need to do:
127//
128// i = (state_id - special.min_accel) / stride
129//
130// (N.B. 'stride' is always a power of 2, so the above can be implemented via
131// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
132// 2^x=stride.)
133//
134// Moreover, some of these specialty categories may be empty. For example,
135// DFAs are not required to have any match states or any accelerated states.
136// In that case, the lower and upper bounds are both set to 0 (the dead state
137// ID) and the first `current_state == 0` check subsumes cases where the
138// ranges are empty.
139//
140// Loop unrolling, if applicable, has also been left out of the logic above.
141//
142// Graphically, the ranges look like this, where asterisks indicate ranges
143// that can be empty. Each 'x' is a state.
144//
145// quit
146// dead|
147// ||
148// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
149// | | | | start | |
150// | |-------------| |-------| |
151// | match* | | | |
152// | | | | |
153// | |----------| | |
154// | accel* | |
155// | | |
156// | | |
157// |----------------------------|------------------------
158// special non-special*
159#[derive(#[automatically_derived]
impl ::core::clone::Clone for Special {
#[inline]
fn clone(&self) -> Special {
let _: ::core::clone::AssertParamIsClone<StateID>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Special { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for Special {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
let names: &'static _ =
&["max", "quit_id", "min_match", "max_match", "min_accel",
"max_accel", "min_start", "max_start"];
let values: &[&dyn ::core::fmt::Debug] =
&[&self.max, &self.quit_id, &self.min_match, &self.max_match,
&self.min_accel, &self.max_accel, &self.min_start,
&&self.max_start];
::core::fmt::Formatter::debug_struct_fields_finish(f, "Special",
names, values)
}
}Debug)]
160pub(crate) struct Special {
161/// The identifier of the last special state in a DFA. A state is special
162 /// if and only if its identifier is less than or equal to `max`.
163pub(crate) max: StateID,
164/// The identifier of the quit state in a DFA. (There is no analogous field
165 /// for the dead state since the dead state's ID is always zero, regardless
166 /// of state ID size.)
167pub(crate) quit_id: StateID,
168/// The identifier of the first match state.
169pub(crate) min_match: StateID,
170/// The identifier of the last match state.
171pub(crate) max_match: StateID,
172/// The identifier of the first accelerated state.
173pub(crate) min_accel: StateID,
174/// The identifier of the last accelerated state.
175pub(crate) max_accel: StateID,
176/// The identifier of the first start state.
177pub(crate) min_start: StateID,
178/// The identifier of the last start state.
179pub(crate) max_start: StateID,
180}
181182impl Special {
183/// Creates a new set of special ranges for a DFA. All ranges are initially
184 /// set to only contain the dead state. This is interpreted as an empty
185 /// range.
186#[cfg(feature = "dfa-build")]
187pub(crate) fn new() -> Special {
188Special {
189 max: DEAD,
190 quit_id: DEAD,
191 min_match: DEAD,
192 max_match: DEAD,
193 min_accel: DEAD,
194 max_accel: DEAD,
195 min_start: DEAD,
196 max_start: DEAD,
197 }
198 }
199200/// Remaps all of the special state identifiers using the function given.
201#[cfg(feature = "dfa-build")]
202pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
203Special {
204 max: map(self.max),
205 quit_id: map(self.quit_id),
206 min_match: map(self.min_match),
207 max_match: map(self.max_match),
208 min_accel: map(self.min_accel),
209 max_accel: map(self.max_accel),
210 min_start: map(self.min_start),
211 max_start: map(self.max_start),
212 }
213 }
214215/// Deserialize the given bytes into special state ranges. If the slice
216 /// given is not big enough, then this returns an error. Similarly, if
217 /// any of the expected invariants around special state ranges aren't
218 /// upheld, an error is returned. Note that this does not guarantee that
219 /// the information returned is correct.
220 ///
221 /// Upon success, this returns the number of bytes read in addition to the
222 /// special state IDs themselves.
223pub(crate) fn from_bytes(
224mut slice: &[u8],
225 ) -> Result<(Special, usize), DeserializeError> {
226 wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
227228let mut nread = 0;
229let mut read_id = |what| -> Result<StateID, DeserializeError> {
230let (id, nr) = wire::try_read_state_id(slice, what)?;
231nread += nr;
232slice = &slice[StateID::SIZE..];
233Ok(id)
234 };
235236let max = read_id("special max id")?;
237let quit_id = read_id("special quit id")?;
238let min_match = read_id("special min match id")?;
239let max_match = read_id("special max match id")?;
240let min_accel = read_id("special min accel id")?;
241let max_accel = read_id("special max accel id")?;
242let min_start = read_id("special min start id")?;
243let max_start = read_id("special max start id")?;
244245let special = Special {
246max,
247quit_id,
248min_match,
249max_match,
250min_accel,
251max_accel,
252min_start,
253max_start,
254 };
255special.validate()?;
256match (&nread, &special.write_to_len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(nread, special.write_to_len());
257Ok((special, nread))
258 }
259260/// Validate that the information describing special states satisfies
261 /// all known invariants.
262pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
263// Check that both ends of the range are DEAD or neither are.
264if self.min_match == DEAD && self.max_match != DEAD {
265return Err(DeserializeError::generic("min_match is DEAD, but max_match is not"));err!("min_match is DEAD, but max_match is not");
266 }
267if self.min_match != DEAD && self.max_match == DEAD {
268return Err(DeserializeError::generic("max_match is DEAD, but min_match is not"));err!("max_match is DEAD, but min_match is not");
269 }
270if self.min_accel == DEAD && self.max_accel != DEAD {
271return Err(DeserializeError::generic("min_accel is DEAD, but max_accel is not"));err!("min_accel is DEAD, but max_accel is not");
272 }
273if self.min_accel != DEAD && self.max_accel == DEAD {
274return Err(DeserializeError::generic("max_accel is DEAD, but min_accel is not"));err!("max_accel is DEAD, but min_accel is not");
275 }
276if self.min_start == DEAD && self.max_start != DEAD {
277return Err(DeserializeError::generic("min_start is DEAD, but max_start is not"));err!("min_start is DEAD, but max_start is not");
278 }
279if self.min_start != DEAD && self.max_start == DEAD {
280return Err(DeserializeError::generic("max_start is DEAD, but min_start is not"));err!("max_start is DEAD, but min_start is not");
281 }
282283// Check that ranges are well formed.
284if self.min_match > self.max_match {
285return Err(DeserializeError::generic("min_match should not be greater than max_match"));err!("min_match should not be greater than max_match");
286 }
287if self.min_accel > self.max_accel {
288return Err(DeserializeError::generic("min_accel should not be greater than max_accel"));err!("min_accel should not be greater than max_accel");
289 }
290if self.min_start > self.max_start {
291return Err(DeserializeError::generic("min_start should not be greater than max_start"));err!("min_start should not be greater than max_start");
292 }
293294// Check that ranges are ordered with respect to one another.
295if self.matches() && self.quit_id >= self.min_match {
296return Err(DeserializeError::generic("quit_id should not be greater than min_match"));err!("quit_id should not be greater than min_match");
297 }
298if self.accels() && self.quit_id >= self.min_accel {
299return Err(DeserializeError::generic("quit_id should not be greater than min_accel"));err!("quit_id should not be greater than min_accel");
300 }
301if self.starts() && self.quit_id >= self.min_start {
302return Err(DeserializeError::generic("quit_id should not be greater than min_start"));err!("quit_id should not be greater than min_start");
303 }
304if self.matches() && self.accels() && self.min_accel < self.min_match {
305return Err(DeserializeError::generic("min_match should not be greater than min_accel"));err!("min_match should not be greater than min_accel");
306 }
307if self.matches() && self.starts() && self.min_start < self.min_match {
308return Err(DeserializeError::generic("min_match should not be greater than min_start"));err!("min_match should not be greater than min_start");
309 }
310if self.accels() && self.starts() && self.min_start < self.min_accel {
311return Err(DeserializeError::generic("min_accel should not be greater than min_start"));err!("min_accel should not be greater than min_start");
312 }
313314// Check that max is at least as big as everything else.
315if self.max < self.quit_id {
316return Err(DeserializeError::generic("quit_id should not be greater than max"));err!("quit_id should not be greater than max");
317 }
318if self.max < self.max_match {
319return Err(DeserializeError::generic("max_match should not be greater than max"));err!("max_match should not be greater than max");
320 }
321if self.max < self.max_accel {
322return Err(DeserializeError::generic("max_accel should not be greater than max"));err!("max_accel should not be greater than max");
323 }
324if self.max < self.max_start {
325return Err(DeserializeError::generic("max_start should not be greater than max"));err!("max_start should not be greater than max");
326 }
327328Ok(())
329 }
330331/// Validate that the special state information is compatible with the
332 /// given state len.
333pub(crate) fn validate_state_len(
334&self,
335 len: usize,
336 stride2: usize,
337 ) -> Result<(), DeserializeError> {
338// We assume that 'validate' has already passed, so we know that 'max'
339 // is truly the max. So all we need to check is that the max state ID
340 // is less than the state ID len. The max legal value here is len-1,
341 // which occurs when there are no non-special states.
342if (self.max.as_usize() >> stride2) >= len {
343return Err(DeserializeError::generic("max should not be greater than or equal to state length"));err!("max should not be greater than or equal to state length");
344 }
345Ok(())
346 }
347348/// Write the IDs and ranges for special states to the given byte buffer.
349 /// The buffer given must have enough room to store all data, otherwise
350 /// this will return an error. The number of bytes written is returned
351 /// on success. The number of bytes written is guaranteed to be a multiple
352 /// of 8.
353pub(crate) fn write_to<E: Endian>(
354&self,
355 dst: &mut [u8],
356 ) -> Result<usize, SerializeError> {
357use crate::util::wire::write_state_idas write;
358359if dst.len() < self.write_to_len() {
360return Err(SerializeError::buffer_too_small("special state ids"));
361 }
362363let mut nwrite = 0;
364nwrite += write::<E>(self.max, &mut dst[nwrite..]);
365nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
366nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
367nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
368nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
369nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
370nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
371nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
372373match (&self.write_to_len(), &nwrite) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("expected to write certain number of bytes")));
}
}
};assert_eq!(
374self.write_to_len(),
375 nwrite,
376"expected to write certain number of bytes",
377 );
378match (&(nwrite % 8), &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::Some(format_args!("expected to write multiple of 8 bytes for special states")));
}
}
};assert_eq!(
379 nwrite % 8,
3800,
381"expected to write multiple of 8 bytes for special states",
382 );
383Ok(nwrite)
384 }
385386/// Returns the total number of bytes written by `write_to`.
387pub(crate) fn write_to_len(&self) -> usize {
3888 * StateID::SIZE389 }
390391/// Sets the maximum special state ID based on the current values. This
392 /// should be used once all possible state IDs are set.
393#[cfg(feature = "dfa-build")]
394pub(crate) fn set_max(&mut self) {
395use core::cmp::max;
396self.max = max(
397self.quit_id,
398max(self.max_match, max(self.max_accel, self.max_start)),
399 );
400 }
401402/// Sets the maximum special state ID such that starting states are not
403 /// considered "special." This also marks the min/max starting states as
404 /// DEAD such that 'is_start_state' always returns false, even if the state
405 /// is actually a starting state.
406 ///
407 /// This is useful when there is no prefilter set. It will avoid
408 /// ping-ponging between the hot path in the DFA search code and the start
409 /// state handling code, which is typically only useful for executing a
410 /// prefilter.
411#[cfg(feature = "dfa-build")]
412pub(crate) fn set_no_special_start_states(&mut self) {
413use core::cmp::max;
414self.max = max(self.quit_id, max(self.max_match, self.max_accel));
415self.min_start = DEAD;
416self.max_start = DEAD;
417 }
418419/// Returns true if and only if the given state ID is a special state.
420#[inline]
421pub(crate) fn is_special_state(&self, id: StateID) -> bool {
422id <= self.max
423 }
424425/// Returns true if and only if the given state ID is a dead state.
426#[inline]
427pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
428id == DEAD429 }
430431/// Returns true if and only if the given state ID is a quit state.
432#[inline]
433pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
434 !self.is_dead_state(id) && self.quit_id == id435 }
436437/// Returns true if and only if the given state ID is a match state.
438#[inline]
439pub(crate) fn is_match_state(&self, id: StateID) -> bool {
440 !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
441 }
442443/// Returns true if and only if the given state ID is an accel state.
444#[inline]
445pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
446 !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
447 }
448449/// Returns true if and only if the given state ID is a start state.
450#[inline]
451pub(crate) fn is_start_state(&self, id: StateID) -> bool {
452 !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
453 }
454455/// Returns the total number of match states for a dense table based DFA.
456#[inline]
457pub(crate) fn match_len(&self, stride: usize) -> usize {
458if self.matches() {
459 (self.max_match.as_usize() - self.min_match.as_usize() + stride)
460 / stride461 } else {
4620
463}
464 }
465466/// Returns true if and only if there is at least one match state.
467#[inline]
468pub(crate) fn matches(&self) -> bool {
469self.min_match != DEAD470 }
471472/// Returns the total number of accel states.
473#[cfg(feature = "dfa-build")]
474pub(crate) fn accel_len(&self, stride: usize) -> usize {
475if self.accels() {
476 (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
477 / stride478 } else {
4790
480}
481 }
482483/// Returns true if and only if there is at least one accel state.
484#[inline]
485pub(crate) fn accels(&self) -> bool {
486self.min_accel != DEAD487 }
488489/// Returns true if and only if there is at least one start state.
490#[inline]
491pub(crate) fn starts(&self) -> bool {
492self.min_start != DEAD493 }
494}