1// This module contains a couple simple and purpose built hash maps. The key
2// trade off they make is that they serve as caches rather than true maps. That
3// is, inserting a new entry may cause eviction of another entry. This gives
4// us two things. First, there's less overhead associated with inserts and
5// lookups. Secondly, it lets us control our memory usage.
6//
7// These maps are used in some fairly hot code when generating NFA states for
8// large Unicode character classes.
9//
10// Instead of exposing a rich hashmap entry API, we just permit the caller to
11// produce a hash of the key directly. The hash can then be reused for both
12// lookups and insertions at the cost of leaking abstraction a bit. But these
13// are for internal use only, so it's fine.
14//
15// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16// (almost) minimal DFA for large Unicode character classes in linear time.
17// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19// since there's a bit more expense in the reverse direction.)
20//
21// The Utf8SuffixMap is used when compiling large Unicode character classes for
22// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23// construction of UTF-8 automata by caching common suffixes. This doesn't
24// get the same space savings as Daciuk's algorithm, but it's basically as
25// fast as the naive approach and typically winds up using less memory (since
26// it generates smaller NFAs) despite the presence of the cache.
27//
28// These maps effectively represent caching mechanisms for sparse and
29// byte-range NFA states, respectively. The former represents a single NFA
30// state with many transitions of equivalent priority while the latter
31// represents a single NFA state with a single transition. (Neither state ever
32// has or is an epsilon transition.) Thus, they have different key types. It's
33// likely we could make one generic map, but the machinery didn't seem worth
34// it. They are simple enough.
3536use alloc::{vec, vec::Vec};
3738use crate::{
39nfa::thompson::Transition,
40 util::{
41 int::{Usize, U64},
42primitives::StateID,
43 },
44};
4546// Basic FNV-1a hash constants as described in:
47// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
48const PRIME: u64 = 1099511628211;
49const INIT: u64 = 14695981039346656037;
5051/// A bounded hash map where the key is a sequence of NFA transitions and the
52/// value is a pre-existing NFA state ID.
53///
54/// std's hashmap can be used for this, however, this map has two important
55/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
56/// control our memory usage by limited the number of slots. In general, the
57/// cost here is that this map acts as a cache. That is, inserting a new entry
58/// may remove an old entry. We are okay with this, since it does not impact
59/// correctness in the cases where it is used. The only effect that dropping
60/// states from the cache has is that the resulting NFA generated may be bigger
61/// than it otherwise would be.
62///
63/// This improves benchmarks that compile large Unicode character classes,
64/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
65/// Specifically, one could observe the difference with std's hashmap via
66/// something like the following benchmark:
67///
68/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'"
69///
70/// But to observe that difference, you'd have to modify the code to use
71/// std's hashmap.
72///
73/// It is quite possible that there is a better way to approach this problem.
74/// For example, if there happens to be a very common state that collides with
75/// a lot of less frequent states, then we could wind up with very poor caching
76/// behavior. Alas, the effectiveness of this cache has not been measured.
77/// Instead, ad hoc experiments suggest that it is "good enough." Additional
78/// smarts (such as an LRU eviction policy) have to be weighed against the
79/// amount of extra time they cost.
80#[derive(#[automatically_derived]
impl ::core::clone::Clone for Utf8BoundedMap {
#[inline]
fn clone(&self) -> Utf8BoundedMap {
Utf8BoundedMap {
version: ::core::clone::Clone::clone(&self.version),
capacity: ::core::clone::Clone::clone(&self.capacity),
map: ::core::clone::Clone::clone(&self.map),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Utf8BoundedMap {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f,
"Utf8BoundedMap", "version", &self.version, "capacity",
&self.capacity, "map", &&self.map)
}
}Debug)]
81pub struct Utf8BoundedMap {
82/// The current version of this map. Only entries with matching versions
83 /// are considered during lookups. If an entry is found with a mismatched
84 /// version, then the map behaves as if the entry does not exist.
85 ///
86 /// This makes it possible to clear the map by simply incrementing the
87 /// version number instead of actually deallocating any storage.
88version: u16,
89/// The total number of entries this map can store.
90capacity: usize,
91/// The actual entries, keyed by hash. Collisions between different states
92 /// result in the old state being dropped.
93map: Vec<Utf8BoundedEntry>,
94}
9596/// An entry in this map.
97#[derive(#[automatically_derived]
impl ::core::clone::Clone for Utf8BoundedEntry {
#[inline]
fn clone(&self) -> Utf8BoundedEntry {
Utf8BoundedEntry {
version: ::core::clone::Clone::clone(&self.version),
key: ::core::clone::Clone::clone(&self.key),
val: ::core::clone::Clone::clone(&self.val),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Utf8BoundedEntry {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f,
"Utf8BoundedEntry", "version", &self.version, "key", &self.key,
"val", &&self.val)
}
}Debug, #[automatically_derived]
impl ::core::default::Default for Utf8BoundedEntry {
#[inline]
fn default() -> Utf8BoundedEntry {
Utf8BoundedEntry {
version: ::core::default::Default::default(),
key: ::core::default::Default::default(),
val: ::core::default::Default::default(),
}
}
}Default)]
98struct Utf8BoundedEntry {
99/// The version of the map used to produce this entry. If this entry's
100 /// version does not match the current version of the map, then the map
101 /// should behave as if this entry does not exist.
102version: u16,
103/// The key, which is a sorted sequence of non-overlapping NFA transitions.
104key: Vec<Transition>,
105/// The state ID corresponding to the state containing the transitions in
106 /// this entry.
107val: StateID,
108}
109110impl Utf8BoundedMap {
111/// Create a new bounded map with the given capacity. The map will never
112 /// grow beyond the given size.
113 ///
114 /// Note that this does not allocate. Instead, callers must call `clear`
115 /// before using this map. `clear` will allocate space if necessary.
116 ///
117 /// This avoids the need to pay for the allocation of this map when
118 /// compiling regexes that lack large Unicode character classes.
119pub fn new(capacity: usize) -> Utf8BoundedMap {
120if !(capacity > 0) {
::core::panicking::panic("assertion failed: capacity > 0")
};assert!(capacity > 0);
121Utf8BoundedMap { version: 0, capacity, map: ::alloc::vec::Vec::new()vec![] }
122 }
123124/// Clear this map of all entries, but permit the reuse of allocation
125 /// if possible.
126 ///
127 /// This must be called before the map can be used.
128pub fn clear(&mut self) {
129if self.map.is_empty() {
130self.map = ::alloc::vec::from_elem(Utf8BoundedEntry::default(), self.capacity)vec![Utf8BoundedEntry::default(); self.capacity];
131 } else {
132self.version = self.version.wrapping_add(1);
133// If we loop back to version 0, then we forcefully clear the
134 // entire map. Otherwise, it might be possible to incorrectly
135 // match entries used to generate other NFAs.
136if self.version == 0 {
137self.map = ::alloc::vec::from_elem(Utf8BoundedEntry::default(), self.capacity)vec![Utf8BoundedEntry::default(); self.capacity];
138 }
139 }
140 }
141142/// Return a hash of the given transitions.
143pub fn hash(&self, key: &[Transition]) -> usize {
144let mut h = INIT;
145for t in key {
146 h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
147 h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
148 h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
149 }
150 (h % self.map.len().as_u64()).as_usize()
151 }
152153/// Retrieve the cached state ID corresponding to the given key. The hash
154 /// given must have been computed with `hash` using the same key value.
155 ///
156 /// If there is no cached state with the given transitions, then None is
157 /// returned.
158pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
159let entry = &self.map[hash];
160if entry.version != self.version {
161return None;
162 }
163// There may be a hash collision, so we need to confirm real equality.
164if entry.key != key {
165return None;
166 }
167Some(entry.val)
168 }
169170/// Add a cached state to this map with the given key. Callers should
171 /// ensure that `state_id` points to a state that contains precisely the
172 /// NFA transitions given.
173 ///
174 /// `hash` must have been computed using the `hash` method with the same
175 /// key.
176pub fn set(
177&mut self,
178 key: Vec<Transition>,
179 hash: usize,
180 state_id: StateID,
181 ) {
182self.map[hash] =
183Utf8BoundedEntry { version: self.version, key, val: state_id };
184 }
185}
186187/// A cache of suffixes used to modestly compress UTF-8 automata for large
188/// Unicode character classes.
189#[derive(#[automatically_derived]
impl ::core::clone::Clone for Utf8SuffixMap {
#[inline]
fn clone(&self) -> Utf8SuffixMap {
Utf8SuffixMap {
version: ::core::clone::Clone::clone(&self.version),
capacity: ::core::clone::Clone::clone(&self.capacity),
map: ::core::clone::Clone::clone(&self.map),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Utf8SuffixMap {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f, "Utf8SuffixMap",
"version", &self.version, "capacity", &self.capacity, "map",
&&self.map)
}
}Debug)]
190pub struct Utf8SuffixMap {
191/// The current version of this map. Only entries with matching versions
192 /// are considered during lookups. If an entry is found with a mismatched
193 /// version, then the map behaves as if the entry does not exist.
194version: u16,
195/// The total number of entries this map can store.
196capacity: usize,
197/// The actual entries, keyed by hash. Collisions between different states
198 /// result in the old state being dropped.
199map: Vec<Utf8SuffixEntry>,
200}
201202/// A key that uniquely identifies an NFA state. It is a triple that represents
203/// a transition from one state for a particular byte range.
204#[derive(#[automatically_derived]
impl ::core::clone::Clone for Utf8SuffixKey {
#[inline]
fn clone(&self) -> Utf8SuffixKey {
Utf8SuffixKey {
from: ::core::clone::Clone::clone(&self.from),
start: ::core::clone::Clone::clone(&self.start),
end: ::core::clone::Clone::clone(&self.end),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Utf8SuffixKey {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f, "Utf8SuffixKey",
"from", &self.from, "start", &self.start, "end", &&self.end)
}
}Debug, #[automatically_derived]
impl ::core::default::Default for Utf8SuffixKey {
#[inline]
fn default() -> Utf8SuffixKey {
Utf8SuffixKey {
from: ::core::default::Default::default(),
start: ::core::default::Default::default(),
end: ::core::default::Default::default(),
}
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for Utf8SuffixKey {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<StateID>;
let _: ::core::cmp::AssertParamIsEq<u8>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Utf8SuffixKey {
#[inline]
fn eq(&self, other: &Utf8SuffixKey) -> bool {
self.start == other.start && self.end == other.end &&
self.from == other.from
}
}PartialEq)]
205pub struct Utf8SuffixKey {
206pub from: StateID,
207pub start: u8,
208pub end: u8,
209}
210211/// An entry in this map.
212#[derive(#[automatically_derived]
impl ::core::clone::Clone for Utf8SuffixEntry {
#[inline]
fn clone(&self) -> Utf8SuffixEntry {
Utf8SuffixEntry {
version: ::core::clone::Clone::clone(&self.version),
key: ::core::clone::Clone::clone(&self.key),
val: ::core::clone::Clone::clone(&self.val),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Utf8SuffixEntry {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f,
"Utf8SuffixEntry", "version", &self.version, "key", &self.key,
"val", &&self.val)
}
}Debug, #[automatically_derived]
impl ::core::default::Default for Utf8SuffixEntry {
#[inline]
fn default() -> Utf8SuffixEntry {
Utf8SuffixEntry {
version: ::core::default::Default::default(),
key: ::core::default::Default::default(),
val: ::core::default::Default::default(),
}
}
}Default)]
213struct Utf8SuffixEntry {
214/// The version of the map used to produce this entry. If this entry's
215 /// version does not match the current version of the map, then the map
216 /// should behave as if this entry does not exist.
217version: u16,
218/// The key, which consists of a transition in a particular state.
219key: Utf8SuffixKey,
220/// The identifier that the transition in the key maps to.
221val: StateID,
222}
223224impl Utf8SuffixMap {
225/// Create a new bounded map with the given capacity. The map will never
226 /// grow beyond the given size.
227 ///
228 /// Note that this does not allocate. Instead, callers must call `clear`
229 /// before using this map. `clear` will allocate space if necessary.
230 ///
231 /// This avoids the need to pay for the allocation of this map when
232 /// compiling regexes that lack large Unicode character classes.
233pub fn new(capacity: usize) -> Utf8SuffixMap {
234if !(capacity > 0) {
::core::panicking::panic("assertion failed: capacity > 0")
};assert!(capacity > 0);
235Utf8SuffixMap { version: 0, capacity, map: ::alloc::vec::Vec::new()vec![] }
236 }
237238/// Clear this map of all entries, but permit the reuse of allocation
239 /// if possible.
240 ///
241 /// This must be called before the map can be used.
242pub fn clear(&mut self) {
243if self.map.is_empty() {
244self.map = ::alloc::vec::from_elem(Utf8SuffixEntry::default(), self.capacity)vec![Utf8SuffixEntry::default(); self.capacity];
245 } else {
246self.version = self.version.wrapping_add(1);
247if self.version == 0 {
248self.map = ::alloc::vec::from_elem(Utf8SuffixEntry::default(), self.capacity)vec![Utf8SuffixEntry::default(); self.capacity];
249 }
250 }
251 }
252253/// Return a hash of the given transition.
254pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
255// Basic FNV-1a hash as described:
256 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
257const PRIME: u64 = 1099511628211;
258const INIT: u64 = 14695981039346656037;
259260let mut h = INIT;
261h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
262h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
263h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
264 (h % self.map.len().as_u64()).as_usize()
265 }
266267/// Retrieve the cached state ID corresponding to the given key. The hash
268 /// given must have been computed with `hash` using the same key value.
269 ///
270 /// If there is no cached state with the given key, then None is returned.
271pub fn get(
272&mut self,
273 key: &Utf8SuffixKey,
274 hash: usize,
275 ) -> Option<StateID> {
276let entry = &self.map[hash];
277if entry.version != self.version {
278return None;
279 }
280if key != &entry.key {
281return None;
282 }
283Some(entry.val)
284 }
285286/// Add a cached state to this map with the given key. Callers should
287 /// ensure that `state_id` points to a state that contains precisely the
288 /// NFA transition given.
289 ///
290 /// `hash` must have been computed using the `hash` method with the same
291 /// key.
292pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
293self.map[hash] =
294Utf8SuffixEntry { version: self.version, key, val: state_id };
295 }
296}