1/*!
2An implementation of the [Two-Way substring search algorithm][two-way].
34[`Finder`] can be built for forward searches, while [`FinderRev`] can be built
5for reverse searches.
67Two-Way makes for a nice general purpose substring search algorithm because of
8its time and space complexity properties. It also performs well in practice.
9Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)`
10time to create a finder, `O(1)` space and `O(n)` search time. In other words,
11the preprocessing step is quick, doesn't require any heap memory and the worst
12case search time is guaranteed to be linear in the haystack regardless of the
13size of the needle.
1415While vector algorithms will usually beat Two-Way handedly, vector algorithms
16also usually have pathological or edge cases that are better handled by Two-Way.
17Moreover, not all targets support vector algorithms or implementations for them
18simply may not exist yet.
1920Two-Way can be found in the `memmem` implementations in at least [GNU libc] and
21[musl].
2223[two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm
24[GNU libc]: https://www.gnu.org/software/libc/
25[musl]: https://www.musl-libc.org/
26*/
2728use core::cmp;
2930use crate::{
31 arch::all::{is_prefix, is_suffix},
32memmem::Pre,
33};
3435/// A forward substring searcher that uses the Two-Way algorithm.
36#[derive(#[automatically_derived]
impl ::core::clone::Clone for Finder {
#[inline]
fn clone(&self) -> Finder {
let _: ::core::clone::AssertParamIsClone<TwoWay>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Finder { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for Finder {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "Finder",
&&self.0)
}
}Debug)]
37pub struct Finder(TwoWay);
3839/// A reverse substring searcher that uses the Two-Way algorithm.
40#[derive(#[automatically_derived]
impl ::core::clone::Clone for FinderRev {
#[inline]
fn clone(&self) -> FinderRev {
let _: ::core::clone::AssertParamIsClone<TwoWay>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for FinderRev { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for FinderRev {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "FinderRev",
&&self.0)
}
}Debug)]
41pub struct FinderRev(TwoWay);
4243/// An implementation of the TwoWay substring search algorithm.
44///
45/// This searcher supports forward and reverse search, although not
46/// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where
47/// `n ~ len(needle)` and `m ~ len(haystack)`.
48///
49/// The implementation here roughly matches that which was developed by
50/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
51/// changes in this implementation are 1) the use of zero-based indices, 2) a
52/// heuristic skip table based on the last byte (borrowed from Rust's standard
53/// library) and 3) the addition of heuristics for a fast skip loop. For (3),
54/// callers can pass any kind of prefilter they want, but usually it's one
55/// based on a heuristic that uses an approximate background frequency of bytes
56/// to choose rare bytes to quickly look for candidate match positions. Note
57/// though that currently, this prefilter functionality is not exposed directly
58/// in the public API. (File an issue if you want it and provide a use case
59/// please.)
60///
61/// The heuristic for fast skipping is automatically shut off if it's
62/// detected to be ineffective at search time. Generally, this only occurs in
63/// pathological cases. But this is generally necessary in order to preserve
64/// a `O(n + m)` time bound.
65///
66/// The code below is fairly complex and not obviously correct at all. It's
67/// likely necessary to read the Two-Way paper cited above in order to fully
68/// grok this code. The essence of it is:
69///
70/// 1. Do something to detect a "critical" position in the needle.
71/// 2. For the current position in the haystack, look if `needle[critical..]`
72/// matches at that position.
73/// 3. If so, look if `needle[..critical]` matches.
74/// 4. If a mismatch occurs, shift the search by some amount based on the
75/// critical position and a pre-computed shift.
76///
77/// This type is wrapped in the forward and reverse finders that expose
78/// consistent forward or reverse APIs.
79#[derive(#[automatically_derived]
impl ::core::clone::Clone for TwoWay {
#[inline]
fn clone(&self) -> TwoWay {
let _: ::core::clone::AssertParamIsClone<ApproximateByteSet>;
let _: ::core::clone::AssertParamIsClone<usize>;
let _: ::core::clone::AssertParamIsClone<Shift>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for TwoWay { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for TwoWay {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f, "TwoWay",
"byteset", &self.byteset, "critical_pos", &self.critical_pos,
"shift", &&self.shift)
}
}Debug)]
80struct TwoWay {
81/// A small bitset used as a quick prefilter (in addition to any prefilter
82 /// given by the caller). Namely, a bit `i` is set if and only if `b%64==i`
83 /// for any `b == needle[i]`.
84 ///
85 /// When used as a prefilter, if the last byte at the current candidate
86 /// position is NOT in this set, then we can skip that entire candidate
87 /// position (the length of the needle). This is essentially the shift
88 /// trick found in Boyer-Moore, but only applied to bytes that don't appear
89 /// in the needle.
90 ///
91 /// N.B. This trick was inspired by something similar in std's
92 /// implementation of Two-Way.
93byteset: ApproximateByteSet,
94/// A critical position in needle. Specifically, this position corresponds
95 /// to beginning of either the minimal or maximal suffix in needle. (N.B.
96 /// See SuffixType below for why "minimal" isn't quite the correct word
97 /// here.)
98 ///
99 /// This is the position at which every search begins. Namely, search
100 /// starts by scanning text to the right of this position, and only if
101 /// there's a match does the text to the left of this position get scanned.
102critical_pos: usize,
103/// The amount we shift by in the Two-Way search algorithm. This
104 /// corresponds to the "small period" and "large period" cases.
105shift: Shift,
106}
107108impl Finder {
109/// Create a searcher that finds occurrences of the given `needle`.
110 ///
111 /// An empty `needle` results in a match at every position in a haystack,
112 /// including at `haystack.len()`.
113#[inline]
114pub fn new(needle: &[u8]) -> Finder {
115let byteset = ApproximateByteSet::new(needle);
116let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
117let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
118let (period_lower_bound, critical_pos) =
119if min_suffix.pos > max_suffix.pos {
120 (min_suffix.period, min_suffix.pos)
121 } else {
122 (max_suffix.period, max_suffix.pos)
123 };
124let shift = Shift::forward(needle, period_lower_bound, critical_pos);
125Finder(TwoWay { byteset, critical_pos, shift })
126 }
127128/// Returns the first occurrence of `needle` in the given `haystack`, or
129 /// `None` if no such occurrence could be found.
130 ///
131 /// The `needle` given must be the same as the `needle` provided to
132 /// [`Finder::new`].
133 ///
134 /// An empty `needle` results in a match at every position in a haystack,
135 /// including at `haystack.len()`.
136#[inline]
137pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
138self.find_with_prefilter(None, haystack, needle)
139 }
140141/// This is like [`Finder::find`], but it accepts a prefilter for
142 /// accelerating searches.
143 ///
144 /// Currently this is not exposed in the public API because, at the time
145 /// of writing, I didn't want to spend time thinking about how to expose
146 /// the prefilter infrastructure (if at all). If you have a compelling use
147 /// case for exposing this routine, please create an issue. Do *not* open
148 /// a PR that just exposes `Pre` and friends. Exporting this routine will
149 /// require API design.
150#[inline(always)]
151pub(crate) fn find_with_prefilter(
152&self,
153 pre: Option<Pre<'_>>,
154 haystack: &[u8],
155 needle: &[u8],
156 ) -> Option<usize> {
157match self.0.shift {
158 Shift::Small { period } => {
159self.find_small_imp(pre, haystack, needle, period)
160 }
161 Shift::Large { shift } => {
162self.find_large_imp(pre, haystack, needle, shift)
163 }
164 }
165 }
166167// Each of the two search implementations below can be accelerated by a
168 // prefilter, but it is not always enabled. To avoid its overhead when
169 // its disabled, we explicitly inline each search implementation based on
170 // whether a prefilter will be used or not. The decision on which to use
171 // is made in the parent meta searcher.
172173#[inline(always)]
174fn find_small_imp(
175&self,
176mut pre: Option<Pre<'_>>,
177 haystack: &[u8],
178 needle: &[u8],
179 period: usize,
180 ) -> Option<usize> {
181let mut pos = 0;
182let mut shift = 0;
183let last_byte_pos = match needle.len().checked_sub(1) {
184None => return Some(pos),
185Some(last_byte) => last_byte,
186 };
187while pos + needle.len() <= haystack.len() {
188let mut i = cmp::max(self.0.critical_pos, shift);
189if let Some(pre) = pre.as_mut() {
190if pre.is_effective() {
191 pos += pre.find(&haystack[pos..])?;
192 shift = 0;
193 i = self.0.critical_pos;
194if pos + needle.len() > haystack.len() {
195return None;
196 }
197 }
198 }
199if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
200 pos += needle.len();
201 shift = 0;
202continue;
203 }
204while i < needle.len() && needle[i] == haystack[pos + i] {
205 i += 1;
206 }
207if i < needle.len() {
208 pos += i - self.0.critical_pos + 1;
209 shift = 0;
210 } else {
211let mut j = self.0.critical_pos;
212while j > shift && needle[j] == haystack[pos + j] {
213 j -= 1;
214 }
215if j <= shift && needle[shift] == haystack[pos + shift] {
216return Some(pos);
217 }
218 pos += period;
219 shift = needle.len() - period;
220 }
221 }
222None223 }
224225#[inline(always)]
226fn find_large_imp(
227&self,
228mut pre: Option<Pre<'_>>,
229 haystack: &[u8],
230 needle: &[u8],
231 shift: usize,
232 ) -> Option<usize> {
233let mut pos = 0;
234let last_byte_pos = match needle.len().checked_sub(1) {
235None => return Some(pos),
236Some(last_byte) => last_byte,
237 };
238'outer: while pos + needle.len() <= haystack.len() {
239if let Some(pre) = pre.as_mut() {
240if pre.is_effective() {
241 pos += pre.find(&haystack[pos..])?;
242if pos + needle.len() > haystack.len() {
243return None;
244 }
245 }
246 }
247248if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
249 pos += needle.len();
250continue;
251 }
252let mut i = self.0.critical_pos;
253while i < needle.len() && needle[i] == haystack[pos + i] {
254 i += 1;
255 }
256if i < needle.len() {
257 pos += i - self.0.critical_pos + 1;
258 } else {
259for j in (0..self.0.critical_pos).rev() {
260if needle[j] != haystack[pos + j] {
261 pos += shift;
262continue 'outer;
263 }
264 }
265return Some(pos);
266 }
267 }
268None269 }
270}
271272impl FinderRev {
273/// Create a searcher that finds occurrences of the given `needle`.
274 ///
275 /// An empty `needle` results in a match at every position in a haystack,
276 /// including at `haystack.len()`.
277#[inline]
278pub fn new(needle: &[u8]) -> FinderRev {
279let byteset = ApproximateByteSet::new(needle);
280let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
281let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
282let (period_lower_bound, critical_pos) =
283if min_suffix.pos < max_suffix.pos {
284 (min_suffix.period, min_suffix.pos)
285 } else {
286 (max_suffix.period, max_suffix.pos)
287 };
288let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
289FinderRev(TwoWay { byteset, critical_pos, shift })
290 }
291292/// Returns the last occurrence of `needle` in the given `haystack`, or
293 /// `None` if no such occurrence could be found.
294 ///
295 /// The `needle` given must be the same as the `needle` provided to
296 /// [`FinderRev::new`].
297 ///
298 /// An empty `needle` results in a match at every position in a haystack,
299 /// including at `haystack.len()`.
300#[inline]
301pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
302// For the reverse case, we don't use a prefilter. It's plausible that
303 // perhaps we should, but it's a lot of additional code to do it, and
304 // it's not clear that it's actually worth it. If you have a really
305 // compelling use case for this, please file an issue.
306match self.0.shift {
307 Shift::Small { period } => {
308self.rfind_small_imp(haystack, needle, period)
309 }
310 Shift::Large { shift } => {
311self.rfind_large_imp(haystack, needle, shift)
312 }
313 }
314 }
315316#[inline(always)]
317fn rfind_small_imp(
318&self,
319 haystack: &[u8],
320 needle: &[u8],
321 period: usize,
322 ) -> Option<usize> {
323let nlen = needle.len();
324let mut pos = haystack.len();
325let mut shift = nlen;
326let first_byte = match needle.get(0) {
327None => return Some(pos),
328Some(&first_byte) => first_byte,
329 };
330while pos >= nlen {
331if !self.0.byteset.contains(haystack[pos - nlen]) {
332 pos -= nlen;
333 shift = nlen;
334continue;
335 }
336let mut i = cmp::min(self.0.critical_pos, shift);
337while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
338 i -= 1;
339 }
340if i > 0 || first_byte != haystack[pos - nlen] {
341 pos -= self.0.critical_pos - i + 1;
342 shift = nlen;
343 } else {
344let mut j = self.0.critical_pos;
345while j < shift && needle[j] == haystack[pos - nlen + j] {
346 j += 1;
347 }
348if j >= shift {
349return Some(pos - nlen);
350 }
351 pos -= period;
352 shift = period;
353 }
354 }
355None356 }
357358#[inline(always)]
359fn rfind_large_imp(
360&self,
361 haystack: &[u8],
362 needle: &[u8],
363 shift: usize,
364 ) -> Option<usize> {
365let nlen = needle.len();
366let mut pos = haystack.len();
367let first_byte = match needle.get(0) {
368None => return Some(pos),
369Some(&first_byte) => first_byte,
370 };
371while pos >= nlen {
372if !self.0.byteset.contains(haystack[pos - nlen]) {
373 pos -= nlen;
374continue;
375 }
376let mut i = self.0.critical_pos;
377while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
378 i -= 1;
379 }
380if i > 0 || first_byte != haystack[pos - nlen] {
381 pos -= self.0.critical_pos - i + 1;
382 } else {
383let mut j = self.0.critical_pos;
384while j < nlen && needle[j] == haystack[pos - nlen + j] {
385 j += 1;
386 }
387if j == nlen {
388return Some(pos - nlen);
389 }
390 pos -= shift;
391 }
392 }
393None394 }
395}
396397/// A representation of the amount we're allowed to shift by during Two-Way
398/// search.
399///
400/// When computing a critical factorization of the needle, we find the position
401/// of the critical factorization by finding the needle's maximal (or minimal)
402/// suffix, along with the period of that suffix. It turns out that the period
403/// of that suffix is a lower bound on the period of the needle itself.
404///
405/// This lower bound is equivalent to the actual period of the needle in
406/// some cases. To describe that case, we denote the needle as `x` where
407/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
408/// bound given here is always the period of `v`, which is `<= period(x)`. The
409/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
410/// where `u` is a suffix of `v[0..period(v)]`.
411///
412/// This case is important because the search algorithm for when the
413/// periods are equivalent is slightly different than the search algorithm
414/// for when the periods are not equivalent. In particular, when they aren't
415/// equivalent, we know that the period of the needle is no less than half its
416/// length. In this case, we shift by an amount less than or equal to the
417/// period of the needle (determined by the maximum length of the components
418/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
419///
420/// The above two cases are represented by the variants below. Each entails
421/// a different instantiation of the Two-Way search algorithm.
422///
423/// N.B. If we could find a way to compute the exact period in all cases,
424/// then we could collapse this case analysis and simplify the algorithm. The
425/// Two-Way paper suggests this is possible, but more reading is required to
426/// grok why the authors didn't pursue that path.
427#[derive(#[automatically_derived]
impl ::core::clone::Clone for Shift {
#[inline]
fn clone(&self) -> Shift {
let _: ::core::clone::AssertParamIsClone<usize>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Shift { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for Shift {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
match self {
Shift::Small { period: __self_0 } =>
::core::fmt::Formatter::debug_struct_field1_finish(f, "Small",
"period", &__self_0),
Shift::Large { shift: __self_0 } =>
::core::fmt::Formatter::debug_struct_field1_finish(f, "Large",
"shift", &__self_0),
}
}
}Debug)]
428enum Shift {
429 Small { period: usize },
430 Large { shift: usize },
431}
432433impl Shift {
434/// Compute the shift for a given needle in the forward direction.
435 ///
436 /// This requires a lower bound on the period and a critical position.
437 /// These can be computed by extracting both the minimal and maximal
438 /// lexicographic suffixes, and choosing the right-most starting position.
439 /// The lower bound on the period is then the period of the chosen suffix.
440fn forward(
441 needle: &[u8],
442 period_lower_bound: usize,
443 critical_pos: usize,
444 ) -> Shift {
445let large = cmp::max(critical_pos, needle.len() - critical_pos);
446if critical_pos * 2 >= needle.len() {
447return Shift::Large { shift: large };
448 }
449450let (u, v) = needle.split_at(critical_pos);
451if !is_suffix(&v[..period_lower_bound], u) {
452return Shift::Large { shift: large };
453 }
454 Shift::Small { period: period_lower_bound }
455 }
456457/// Compute the shift for a given needle in the reverse direction.
458 ///
459 /// This requires a lower bound on the period and a critical position.
460 /// These can be computed by extracting both the minimal and maximal
461 /// lexicographic suffixes, and choosing the left-most starting position.
462 /// The lower bound on the period is then the period of the chosen suffix.
463fn reverse(
464 needle: &[u8],
465 period_lower_bound: usize,
466 critical_pos: usize,
467 ) -> Shift {
468let large = cmp::max(critical_pos, needle.len() - critical_pos);
469if (needle.len() - critical_pos) * 2 >= needle.len() {
470return Shift::Large { shift: large };
471 }
472473let (v, u) = needle.split_at(critical_pos);
474if !is_prefix(&v[v.len() - period_lower_bound..], u) {
475return Shift::Large { shift: large };
476 }
477 Shift::Small { period: period_lower_bound }
478 }
479}
480481/// A suffix extracted from a needle along with its period.
482#[derive(#[automatically_derived]
impl ::core::fmt::Debug for Suffix {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f, "Suffix", "pos",
&self.pos, "period", &&self.period)
}
}Debug)]
483struct Suffix {
484/// The starting position of this suffix.
485 ///
486 /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
487 /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
488 /// forward suffixes, this is an inclusive starting position, where as for
489 /// reverse suffixes, this is an exclusive ending position.
490pos: usize,
491/// The period of this suffix.
492 ///
493 /// Note that this is NOT necessarily the period of the string from which
494 /// this suffix comes from. (It is always less than or equal to the period
495 /// of the original string.)
496period: usize,
497}
498499impl Suffix {
500fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
501// suffix represents our maximal (or minimal) suffix, along with
502 // its period.
503let mut suffix = Suffix { pos: 0, period: 1 };
504// The start of a suffix in `needle` that we are considering as a
505 // more maximal (or minimal) suffix than what's in `suffix`.
506let mut candidate_start = 1;
507// The current offset of our suffixes that we're comparing.
508 //
509 // When the characters at this offset are the same, then we mush on
510 // to the next position since no decision is possible. When the
511 // candidate's character is greater (or lesser) than the corresponding
512 // character than our current maximal (or minimal) suffix, then the
513 // current suffix is changed over to the candidate and we restart our
514 // search. Otherwise, the candidate suffix is no good and we restart
515 // our search on the next candidate.
516 //
517 // The three cases above correspond to the three cases in the loop
518 // below.
519let mut offset = 0;
520521while candidate_start + offset < needle.len() {
522let current = needle[suffix.pos + offset];
523let candidate = needle[candidate_start + offset];
524match kind.cmp(current, candidate) {
525 SuffixOrdering::Accept => {
526 suffix = Suffix { pos: candidate_start, period: 1 };
527 candidate_start += 1;
528 offset = 0;
529 }
530 SuffixOrdering::Skip => {
531 candidate_start += offset + 1;
532 offset = 0;
533 suffix.period = candidate_start - suffix.pos;
534 }
535 SuffixOrdering::Push => {
536if offset + 1 == suffix.period {
537 candidate_start += suffix.period;
538 offset = 0;
539 } else {
540 offset += 1;
541 }
542 }
543 }
544 }
545suffix546 }
547548fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
549// See the comments in `forward` for how this works.
550let mut suffix = Suffix { pos: needle.len(), period: 1 };
551if needle.len() == 1 {
552return suffix;
553 }
554let mut candidate_start = match needle.len().checked_sub(1) {
555None => return suffix,
556Some(candidate_start) => candidate_start,
557 };
558let mut offset = 0;
559560while offset < candidate_start {
561let current = needle[suffix.pos - offset - 1];
562let candidate = needle[candidate_start - offset - 1];
563match kind.cmp(current, candidate) {
564 SuffixOrdering::Accept => {
565 suffix = Suffix { pos: candidate_start, period: 1 };
566 candidate_start -= 1;
567 offset = 0;
568 }
569 SuffixOrdering::Skip => {
570 candidate_start -= offset + 1;
571 offset = 0;
572 suffix.period = suffix.pos - candidate_start;
573 }
574 SuffixOrdering::Push => {
575if offset + 1 == suffix.period {
576 candidate_start -= suffix.period;
577 offset = 0;
578 } else {
579 offset += 1;
580 }
581 }
582 }
583 }
584suffix585 }
586}
587588/// The kind of suffix to extract.
589#[derive(#[automatically_derived]
impl ::core::clone::Clone for SuffixKind {
#[inline]
fn clone(&self) -> SuffixKind { *self }
}Clone, #[automatically_derived]
impl ::core::marker::Copy for SuffixKind { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for SuffixKind {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::write_str(f,
match self {
SuffixKind::Minimal => "Minimal",
SuffixKind::Maximal => "Maximal",
})
}
}Debug)]
590enum SuffixKind {
591/// Extract the smallest lexicographic suffix from a string.
592 ///
593 /// Technically, this doesn't actually pick the smallest lexicographic
594 /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
595 /// the latter over the former, even though `a < aa`. The reasoning for
596 /// this isn't clear from the paper, but it still smells like a minimal
597 /// suffix.
598Minimal,
599/// Extract the largest lexicographic suffix from a string.
600 ///
601 /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
602 /// the choice between `z` and `zz`, this will choose the latter over the
603 /// former.
604Maximal,
605}
606607/// The result of comparing corresponding bytes between two suffixes.
608#[derive(#[automatically_derived]
impl ::core::clone::Clone for SuffixOrdering {
#[inline]
fn clone(&self) -> SuffixOrdering { *self }
}Clone, #[automatically_derived]
impl ::core::marker::Copy for SuffixOrdering { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for SuffixOrdering {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::write_str(f,
match self {
SuffixOrdering::Accept => "Accept",
SuffixOrdering::Skip => "Skip",
SuffixOrdering::Push => "Push",
})
}
}Debug)]
609enum SuffixOrdering {
610/// This occurs when the given candidate byte indicates that the candidate
611 /// suffix is better than the current maximal (or minimal) suffix. That is,
612 /// the current candidate suffix should supplant the current maximal (or
613 /// minimal) suffix.
614Accept,
615/// This occurs when the given candidate byte excludes the candidate suffix
616 /// from being better than the current maximal (or minimal) suffix. That
617 /// is, the current candidate suffix should be dropped and the next one
618 /// should be considered.
619Skip,
620/// This occurs when no decision to accept or skip the candidate suffix
621 /// can be made, e.g., when corresponding bytes are equivalent. In this
622 /// case, the next corresponding bytes should be compared.
623Push,
624}
625626impl SuffixKind {
627/// Returns true if and only if the given candidate byte indicates that
628 /// it should replace the current suffix as the maximal (or minimal)
629 /// suffix.
630fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
631use self::SuffixOrdering::*;
632633match self {
634 SuffixKind::Minimalif candidate < current => Accept,
635 SuffixKind::Minimalif candidate > current => Skip,
636 SuffixKind::Minimal => Push,
637 SuffixKind::Maximalif candidate > current => Accept,
638 SuffixKind::Maximalif candidate < current => Skip,
639 SuffixKind::Maximal => Push,
640 }
641 }
642}
643644/// A bitset used to track whether a particular byte exists in a needle or not.
645///
646/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
647/// needle. If a particular byte in the haystack is NOT in this set, then one
648/// can conclude that it is also not in the needle, and thus, one can advance
649/// in the haystack by needle.len() bytes.
650#[derive(#[automatically_derived]
impl ::core::clone::Clone for ApproximateByteSet {
#[inline]
fn clone(&self) -> ApproximateByteSet {
let _: ::core::clone::AssertParamIsClone<u64>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ApproximateByteSet { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for ApproximateByteSet {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"ApproximateByteSet", &&self.0)
}
}Debug)]
651struct ApproximateByteSet(u64);
652653impl ApproximateByteSet {
654/// Create a new set from the given needle.
655fn new(needle: &[u8]) -> ApproximateByteSet {
656let mut bits = 0;
657for &b in needle {
658 bits |= 1 << (b % 64);
659 }
660ApproximateByteSet(bits)
661 }
662663/// Return true if and only if the given byte might be in this set. This
664 /// may return a false positive, but will never return a false negative.
665#[inline(always)]
666fn contains(&self, byte: u8) -> bool {
667self.0 & (1 << (byte % 64)) != 0
668}
669}
670671#[cfg(test)]
672mod tests {
673use alloc::vec::Vec;
674675use super::*;
676677/// Convenience wrapper for computing the suffix as a byte string.
678fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
679let s = Suffix::forward(needle, kind);
680 (&needle[s.pos..], s.period)
681 }
682683/// Convenience wrapper for computing the reverse suffix as a byte string.
684fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
685let s = Suffix::reverse(needle, kind);
686 (&needle[..s.pos], s.period)
687 }
688689/// Return all of the non-empty suffixes in the given byte string.
690fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
691 (0..bytes.len()).map(|i| &bytes[i..]).collect()
692 }
693694/// Return the lexicographically maximal suffix of the given byte string.
695fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
696let mut sufs = suffixes(needle);
697 sufs.sort();
698 sufs.pop().unwrap()
699 }
700701/// Return the lexicographically maximal suffix of the reverse of the given
702 /// byte string.
703fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
704let mut reversed = needle.to_vec();
705 reversed.reverse();
706let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
707 got.reverse();
708 got
709 }
710711define_substring_forward_quickcheck!(|h, n| Some(
712 Finder::new(n).find(h, n)
713 ));
714define_substring_reverse_quickcheck!(|h, n| Some(
715 FinderRev::new(n).rfind(h, n)
716 ));
717718#[test]
719fn forward() {
720crate::tests::substring::Runner::new()
721 .fwd(|h, n| Some(Finder::new(n).find(h, n)))
722 .run();
723 }
724725#[test]
726fn reverse() {
727crate::tests::substring::Runner::new()
728 .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)))
729 .run();
730 }
731732#[test]
733fn suffix_forward() {
734macro_rules! assert_suffix_min {
735 ($given:expr, $expected:expr, $period:expr) => {
736let (got_suffix, got_period) =
737 get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
738let got_suffix = core::str::from_utf8(got_suffix).unwrap();
739assert_eq!(($expected, $period), (got_suffix, got_period));
740 };
741 }
742743macro_rules! assert_suffix_max {
744 ($given:expr, $expected:expr, $period:expr) => {
745let (got_suffix, got_period) =
746 get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
747let got_suffix = core::str::from_utf8(got_suffix).unwrap();
748assert_eq!(($expected, $period), (got_suffix, got_period));
749 };
750 }
751752assert_suffix_min!("a", "a", 1);
753assert_suffix_max!("a", "a", 1);
754755assert_suffix_min!("ab", "ab", 2);
756assert_suffix_max!("ab", "b", 1);
757758assert_suffix_min!("ba", "a", 1);
759assert_suffix_max!("ba", "ba", 2);
760761assert_suffix_min!("abc", "abc", 3);
762assert_suffix_max!("abc", "c", 1);
763764assert_suffix_min!("acb", "acb", 3);
765assert_suffix_max!("acb", "cb", 2);
766767assert_suffix_min!("cba", "a", 1);
768assert_suffix_max!("cba", "cba", 3);
769770assert_suffix_min!("abcabc", "abcabc", 3);
771assert_suffix_max!("abcabc", "cabc", 3);
772773assert_suffix_min!("abcabcabc", "abcabcabc", 3);
774assert_suffix_max!("abcabcabc", "cabcabc", 3);
775776assert_suffix_min!("abczz", "abczz", 5);
777assert_suffix_max!("abczz", "zz", 1);
778779assert_suffix_min!("zzabc", "abc", 3);
780assert_suffix_max!("zzabc", "zzabc", 5);
781782assert_suffix_min!("aaa", "aaa", 1);
783assert_suffix_max!("aaa", "aaa", 1);
784785assert_suffix_min!("foobar", "ar", 2);
786assert_suffix_max!("foobar", "r", 1);
787 }
788789#[test]
790fn suffix_reverse() {
791macro_rules! assert_suffix_min {
792 ($given:expr, $expected:expr, $period:expr) => {
793let (got_suffix, got_period) =
794 get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
795let got_suffix = core::str::from_utf8(got_suffix).unwrap();
796assert_eq!(($expected, $period), (got_suffix, got_period));
797 };
798 }
799800macro_rules! assert_suffix_max {
801 ($given:expr, $expected:expr, $period:expr) => {
802let (got_suffix, got_period) =
803 get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
804let got_suffix = core::str::from_utf8(got_suffix).unwrap();
805assert_eq!(($expected, $period), (got_suffix, got_period));
806 };
807 }
808809assert_suffix_min!("a", "a", 1);
810assert_suffix_max!("a", "a", 1);
811812assert_suffix_min!("ab", "a", 1);
813assert_suffix_max!("ab", "ab", 2);
814815assert_suffix_min!("ba", "ba", 2);
816assert_suffix_max!("ba", "b", 1);
817818assert_suffix_min!("abc", "a", 1);
819assert_suffix_max!("abc", "abc", 3);
820821assert_suffix_min!("acb", "a", 1);
822assert_suffix_max!("acb", "ac", 2);
823824assert_suffix_min!("cba", "cba", 3);
825assert_suffix_max!("cba", "c", 1);
826827assert_suffix_min!("abcabc", "abca", 3);
828assert_suffix_max!("abcabc", "abcabc", 3);
829830assert_suffix_min!("abcabcabc", "abcabca", 3);
831assert_suffix_max!("abcabcabc", "abcabcabc", 3);
832833assert_suffix_min!("abczz", "a", 1);
834assert_suffix_max!("abczz", "abczz", 5);
835836assert_suffix_min!("zzabc", "zza", 3);
837assert_suffix_max!("zzabc", "zz", 1);
838839assert_suffix_min!("aaa", "aaa", 1);
840assert_suffix_max!("aaa", "aaa", 1);
841 }
842843#[cfg(not(miri))]
844quickcheck::quickcheck! {
845fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
846if bytes.is_empty() {
847return true;
848 }
849850let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
851let expected = naive_maximal_suffix_forward(&bytes);
852 got == expected
853 }
854855fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
856if bytes.is_empty() {
857return true;
858 }
859860let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
861let expected = naive_maximal_suffix_reverse(&bytes);
862 expected == got
863 }
864 }
865866// This is a regression test caught by quickcheck that exercised a bug in
867 // the reverse small period handling. The bug was that we were using 'if j
868 // == shift' to determine if a match occurred, but the correct guard is 'if
869 // j >= shift', which matches the corresponding guard in the forward impl.
870#[test]
871fn regression_rev_small_period() {
872let rfind = |h, n| FinderRev::new(n).rfind(h, n);
873let haystack = "ababaz";
874let needle = "abab";
875assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
876 }
877}