1use core::{cell::RefCell, fmt, mem};
23use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
45use crate::{
6 dfa::{automaton::Automaton, dense, DEAD},
7 util::{
8alphabet,
9 primitives::{PatternID, StateID},
10 },
11};
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
/// Hopcroft's algorithm starts by creating two partitions of DFA states
/// that are known to NOT be equivalent: match states and non-match states.
/// The algorithm proceeds by progressively refining these partitions into
/// smaller partitions. If we could start with more partitions, then we
/// could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
/// every state in the partition for *every* element in the alphabet. (This
/// is why using byte classes can significantly decrease minimization times,
/// since byte classes shrink the alphabet.) This is quite costly and there
/// is perhaps some redundant work being performed depending on the specific
/// states in the set. For example, we might be able to only visit some
/// elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
/// fewer states to deal with, then it should run faster. A prime example
/// of this might be large Unicode classes, which are generated in a way that
/// can create a lot of redundant states. (Some work has been done on this
/// point during NFA compilation via the algorithm described in the
/// "Incremental Construction of Minimal Acyclic Finite-State Automata"
/// paper.)
pub(crate) struct Minimizer<'a> {
    /// The DFA being minimized. It is mutated in place by `run`.
    dfa: &'a mut dense::OwnedDFA,
    /// The reverse transition table: for each state (by dense index) and each
    /// alphabet unit, the list of states with a transition into that state on
    /// that unit. Built once by `incoming_transitions`.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of states. Each set is a candidate
    /// equivalence class, progressively refined by Hopcroft's algorithm.
    partitions: Vec<StateSet>,
    /// The sets still waiting to be used as splitters. State sets are
    /// shared (`Rc`) so membership here costs no extra allocation.
    waiting: Vec<StateSet>,
}
5051impl<'a> fmt::Debugfor Minimizer<'a> {
52fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
53f.debug_struct("Minimizer")
54 .field("dfa", &self.dfa)
55 .field("in_transitions", &self.in_transitions)
56 .field("partitions", &self.partitions)
57 .field("waiting", &self.waiting)
58 .finish()
59 }
60}
6162/// A set of states. A state set makes up a single partition in Hopcroft's
63/// algorithm.
64///
65/// It is represented by an ordered set of state identifiers. We use shared
66/// ownership so that a single state set can be in both the set of partitions
67/// and in the set of waiting sets simultaneously without an additional
68/// allocation. Generally, once a state set is built, it becomes immutable.
69///
70/// We use this representation because it avoids the overhead of more
71/// traditional set data structures (HashSet/BTreeSet), and also because
72/// computing intersection/subtraction on this representation is especially
73/// fast.
74#[derive(#[automatically_derived]
impl ::core::clone::Clone for StateSet {
#[inline]
fn clone(&self) -> StateSet {
StateSet { ids: ::core::clone::Clone::clone(&self.ids) }
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for StateSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "StateSet",
"ids", &&self.ids)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for StateSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Rc<RefCell<Vec<StateID>>>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for StateSet {
#[inline]
fn eq(&self, other: &StateSet) -> bool { self.ids == other.ids }
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for StateSet {
#[inline]
fn partial_cmp(&self, other: &StateSet)
-> ::core::option::Option<::core::cmp::Ordering> {
::core::cmp::PartialOrd::partial_cmp(&self.ids, &other.ids)
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for StateSet {
#[inline]
fn cmp(&self, other: &StateSet) -> ::core::cmp::Ordering {
::core::cmp::Ord::cmp(&self.ids, &other.ids)
}
}Ord)]
75struct StateSet {
76 ids: Rc<RefCell<Vec<StateID>>>,
77}
impl<'a> Minimizer<'a> {
    /// Create a new minimizer for the given DFA.
    ///
    /// This precomputes the reverse transition table and the initial
    /// partitioning; the actual minimization happens in `run`.
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        // Every initial partition also starts out waiting to be used as a
        // splitter.
        let waiting = partitions.clone();
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run Hopcroft's algorithm and rewrite the DFA in place so that it
    /// contains exactly one state per equivalence class.
    pub fn run(mut self) {
        let stride2 = self.dfa.stride2();
        // State IDs are premultiplied by the DFA's stride (a power of two),
        // so converting between an ID and a dense index is a shift.
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across iterations to avoid re-allocating.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be a rather large optimization.
                // On the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // This partition is split by 'incoming': replace it with
                    // the intersection (x) and the difference (y).
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        // If the split partition was waiting, both halves
                        // must be processed.
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        // Otherwise, processing the smaller half suffices
                        // (the standard Hopcroft refinement trick).
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap the refined partitioning in and recycle the old
                // vector's allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // DEAD means the old start state was collapsed away entirely;
                // it cannot contribute to the new start range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }

    /// Returns the position of the given set in the waiting queue, if it is
    /// present.
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Compute the set of states with a transition on `b` into any state in
    /// `set`, writing the result (canonicalized) into `incoming`.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        // Additions above may produce duplicates and are unordered; the set
        // operations in `run` rely on sorted, deduped sets.
        incoming.canonicalize();
    }

    /// Build the initial partitioning of states: one partition per distinct
    /// pattern ID list among match states, plus one partition for quit states
    /// and one for all remaining (non-match) states.
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
        // For match states, we know that two match states with different
        // pattern ID lists will *always* be distinct, so we can partition them
        // initially based on that.
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
        let mut is_quit = StateSet::empty();
        let mut no_match = StateSet::empty();
        for state in dfa.states() {
            if dfa.is_match_state(state.id()) {
                let mut pids = vec![];
                for i in 0..dfa.match_len(state.id()) {
                    pids.push(dfa.match_pattern(state.id(), i));
                }
                matching
                    .entry(pids)
                    .or_insert(StateSet::empty())
                    .add(state.id());
            } else if dfa.is_quit_state(state.id()) {
                is_quit.add(state.id());
            } else {
                no_match.add(state.id());
            }
        }

        let mut sets: Vec<StateSet> =
            matching.into_iter().map(|(_, set)| set).collect();
        sets.push(no_match);
        sets.push(is_quit);
        sets
    }

    /// Build the reverse transition table, indexed by
    /// `[state index][alphabet unit]` and yielding the IDs of all states with
    /// a transition into that state on that unit.
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for state in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
            }
        }
        incoming
    }
}
impl StateSet {
    /// Create a new, empty set of states.
    fn empty() -> StateSet {
        StateSet { ids: Rc::new(RefCell::new(vec![])) }
    }

    /// Add a state ID to this set. This does not keep the set sorted or
    /// deduped; callers follow a batch of additions with `canonicalize`.
    fn add(&mut self, id: StateID) {
        self.ids.borrow_mut().push(id);
    }

    /// Returns the first state ID in this set. After `canonicalize`, this is
    /// the minimum ID in the set.
    ///
    /// Panics if the set is empty.
    fn min(&self) -> StateID {
        self.ids.borrow()[0]
    }

    /// Sort and dedupe the IDs, establishing the ordered-set invariant that
    /// `intersection` and `subtract` rely on.
    fn canonicalize(&mut self) {
        self.ids.borrow_mut().sort();
        self.ids.borrow_mut().dedup();
    }

    /// Remove all IDs from this set.
    fn clear(&mut self) {
        self.ids.borrow_mut().clear();
    }

    /// Returns the number of IDs in this set.
    fn len(&self) -> usize {
        self.ids.borrow().len()
    }

    /// Returns true if this set contains no IDs.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Create a copy of this set with its own allocation. (Unlike `clone`,
    /// which merely bumps the `Rc` refcount and shares the underlying `Vec`.)
    fn deep_clone(&self) -> StateSet {
        let ids = self.ids.borrow().iter().cloned().collect();
        StateSet { ids: Rc::new(RefCell::new(ids)) }
    }

    /// Call `f` on each state ID in this set, in order.
    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
        for &id in self.ids.borrow().iter() {
            f(id);
        }
    }

    /// Write the intersection of `self` and `other` into `dest` (which is
    /// cleared first). This is a linear merge, so it assumes both sets are
    /// canonical (sorted and deduped).
    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Neither set is empty, so both iterators yield at least one item.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                // 'a' cannot be in 'other'; advance the smaller side.
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Write the set difference `self - other` into `dest` (which is cleared
    /// first). Like `intersection`, this is a linear merge that assumes both
    /// sets are canonical (sorted and deduped).
    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        // Subtracting nothing (or from nothing) leaves 'self' unchanged.
        if self.is_empty() || other.is_empty() {
            self.iter(|s| dest.add(s));
            return;
        }

        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Neither set is empty, so both iterators yield at least one item.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                // 'a' is in 'other': skip it.
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    // 'other' is exhausted, so the new 'a' survives.
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                // 'a' cannot appear in 'other': keep it.
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    // 'other' is exhausted, so 'a' survives.
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        // Anything left in 'self' has no counterpart in 'other'.
        for a in ita {
            dest.add(a);
        }
    }
}