use alloc::{sync::Arc, vec, vec::Vec};

use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;

// Enforce the power-of-two invariant documented above at compile time, so a
// future edit to NUM_BUCKETS can't silently de-optimize the bucket selection.
const _: () = assert!(NUM_BUCKETS.is_power_of_two());
1718/// An implementation of the Rabin-Karp algorithm. The main idea of this
19/// algorithm is to maintain a rolling hash as it moves through the input, and
20/// then check whether that hash corresponds to the same hash for any of the
21/// patterns we're looking for.
22///
23/// A draw back of naively scaling Rabin-Karp to multiple patterns is that
24/// it requires all of the patterns to be the same length, which in turn
25/// corresponds to the number of bytes to hash. We adapt this to work for
26/// multiple patterns of varying size by fixing the number of bytes to hash
27/// to be the length of the smallest pattern. We also split the patterns into
28/// several buckets to hopefully make the confirmation step faster.
29///
30/// Wikipedia has a decent explanation, if a bit heavy on the theory:
31/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
32///
33/// But ESMAJ provides something a bit more concrete:
34/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
35#[derive(#[automatically_derived]
impl ::core::clone::Clone for RabinKarp {
#[inline]
fn clone(&self) -> RabinKarp {
RabinKarp {
patterns: ::core::clone::Clone::clone(&self.patterns),
buckets: ::core::clone::Clone::clone(&self.buckets),
hash_len: ::core::clone::Clone::clone(&self.hash_len),
hash_2pow: ::core::clone::Clone::clone(&self.hash_2pow),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for RabinKarp {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field4_finish(f, "RabinKarp",
"patterns", &self.patterns, "buckets", &self.buckets, "hash_len",
&self.hash_len, "hash_2pow", &&self.hash_2pow)
}
}Debug)]
36pub(crate) struct RabinKarp {
37/// The patterns we're searching for.
38patterns: Arc<Patterns>,
39/// The order of patterns in each bucket is significant. Namely, they are
40 /// arranged such that the first one to match is the correct match. This
41 /// may not necessarily correspond to the order provided by the caller.
42 /// For example, if leftmost-longest semantics are used, then the patterns
43 /// are sorted by their length in descending order. If leftmost-first
44 /// semantics are used, then the patterns are sorted by their pattern ID
45 /// in ascending order (which corresponds to the caller's order).
46buckets: Vec<Vec<(Hash, PatternID)>>,
47/// The length of the hashing window. Generally, this corresponds to the
48 /// length of the smallest pattern.
49hash_len: usize,
50/// The factor to subtract out of a hash before updating it with a new
51 /// byte.
52hash_2pow: usize,
53}
5455impl RabinKarp {
56/// Compile a new Rabin-Karp matcher from the patterns given.
57 ///
58 /// This panics if any of the patterns in the collection are empty, or if
59 /// the collection is itself empty.
60pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
61if !(patterns.len() >= 1) {
::core::panicking::panic("assertion failed: patterns.len() >= 1")
};assert!(patterns.len() >= 1);
62let hash_len = patterns.minimum_len();
63if !(hash_len >= 1) {
::core::panicking::panic("assertion failed: hash_len >= 1")
};assert!(hash_len >= 1);
6465let mut hash_2pow = 1usize;
66for _ in 1..hash_len {
67 hash_2pow = hash_2pow.wrapping_shl(1);
68 }
6970let mut rk = RabinKarp {
71 patterns: Arc::clone(patterns),
72 buckets: ::alloc::vec::from_elem(::alloc::vec::Vec::new(), NUM_BUCKETS)vec![vec![]; NUM_BUCKETS],
73hash_len,
74hash_2pow,
75 };
76for (id, pat) in patterns.iter() {
77let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
78let bucket = hash % NUM_BUCKETS;
79 rk.buckets[bucket].push((hash, id));
80 }
81rk82 }
8384/// Return the first matching pattern in the given haystack, begining the
85 /// search at `at`.
86pub(crate) fn find_at(
87&self,
88 haystack: &[u8],
89mut at: usize,
90 ) -> Option<Match> {
91match (&NUM_BUCKETS, &self.buckets.len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(NUM_BUCKETS, self.buckets.len());
9293if at + self.hash_len > haystack.len() {
94return None;
95 }
96let mut hash = self.hash(&haystack[at..at + self.hash_len]);
97loop {
98let bucket = &self.buckets[hash % NUM_BUCKETS];
99for &(phash, pid) in bucket {
100if phash == hash {
101if let Some(c) = self.verify(pid, haystack, at) {
102return Some(c);
103 }
104 }
105 }
106if at + self.hash_len >= haystack.len() {
107return None;
108 }
109hash = self.update_hash(
110hash,
111haystack[at],
112haystack[at + self.hash_len],
113 );
114at += 1;
115 }
116 }
117118/// Returns the approximate total amount of heap used by this searcher, in
119 /// units of bytes.
120pub(crate) fn memory_usage(&self) -> usize {
121self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
122 + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
123 }
124125/// Verify whether the pattern with the given id matches at
126 /// `haystack[at..]`.
127 ///
128 /// We tag this function as `cold` because it helps improve codegen.
129 /// Intuitively, it would seem like inlining it would be better. However,
130 /// the only time this is called and a match is not found is when there
131 /// there is a hash collision, or when a prefix of a pattern matches but
132 /// the entire pattern doesn't match. This is hopefully fairly rare, and
133 /// if it does occur a lot, it's going to be slow no matter what we do.
134#[cold]
135fn verify(
136&self,
137 id: PatternID,
138 haystack: &[u8],
139 at: usize,
140 ) -> Option<Match> {
141let pat = self.patterns.get(id);
142if pat.is_prefix(&haystack[at..]) {
143Some(Match::new(id, at..at + pat.len()))
144 } else {
145None146 }
147 }
148149/// Hash the given bytes.
150fn hash(&self, bytes: &[u8]) -> Hash {
151match (&self.hash_len, &bytes.len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(self.hash_len, bytes.len());
152153let mut hash = 0usize;
154for &b in bytes {
155 hash = hash.wrapping_shl(1).wrapping_add(b as usize);
156 }
157hash158 }
159160/// Update the hash given based on removing `old_byte` at the beginning
161 /// of some byte string, and appending `new_byte` to the end of that same
162 /// byte string.
163fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
164prev.wrapping_sub((old_byteas usize).wrapping_mul(self.hash_2pow))
165 .wrapping_shl(1)
166 .wrapping_add(new_byteas usize)
167 }
168}