1use alloc::{vec, vec::Vec};
23use regex_syntax::hir::Hir;
45use crate::{meta::regex::RegexInfo, util::search::MatchKind};
67/// Pull out an alternation of literals from the given sequence of HIR
8/// expressions.
9///
10/// There are numerous ways for this to fail. Generally, this only applies
11/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
12/// are "too few" alternates, in which case, the regex engine is likely faster.
13///
14/// And currently, this only returns something when 'hirs.len() == 1'.
15pub(crate) fn alternation_literals(
16 info: &RegexInfo,
17 hirs: &[&Hir],
18) -> Option<Vec<Vec<u8>>> {
19use regex_syntax::hir::{HirKind, Literal};
2021// Might as well skip the work below if we know we can't build an
22 // Aho-Corasick searcher.
23if !truecfg!(feature = "perf-literal-multisubstring") {
24return None;
25 }
26// This is pretty hacky, but basically, if `is_alternation_literal` is
27 // true, then we can make several assumptions about the structure of our
28 // HIR. This is what justifies the `unreachable!` statements below.
29if hirs.len() != 1
30|| !info.props()[0].look_set().is_empty()
31 || info.props()[0].explicit_captures_len() > 0
32|| !info.props()[0].is_alternation_literal()
33 || info.config().get_match_kind() != MatchKind::LeftmostFirst34 {
35return None;
36 }
37let hir = &hirs[0];
38let alts = match *hir.kind() {
39 HirKind::Alternation(ref alts) => alts,
40_ => return None, // one literal isn't worth it
41};
4243let mut lits = ::alloc::vec::Vec::new()vec![];
44for alt in alts {
45let mut lit = ::alloc::vec::Vec::new()vec![];
46match *alt.kind() {
47 HirKind::Literal(Literal(ref bytes)) => {
48 lit.extend_from_slice(bytes)
49 }
50 HirKind::Concat(ref exprs) => {
51for e in exprs {
52match *e.kind() {
53 HirKind::Literal(Literal(ref bytes)) => {
54 lit.extend_from_slice(bytes);
55 }
56_ => {
::core::panicking::panic_fmt(format_args!("internal error: entered unreachable code: {0}",
format_args!("expected literal, got {0:?}", e)));
}unreachable!("expected literal, got {e:?}"),
57 }
58 }
59 }
60_ => {
::core::panicking::panic_fmt(format_args!("internal error: entered unreachable code: {0}",
format_args!("expected literal or concat, got {0:?}", alt)));
}unreachable!("expected literal or concat, got {alt:?}"),
61 }
62 lits.push(lit);
63 }
64// Why do this? Well, when the number of literals is small, it's likely
65 // that we'll use the lazy DFA which is in turn likely to be faster than
66 // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
67 // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
68 // the latter because it is so hungry (in time and space), and the former
69 // is decently fast, but not as fast as a well oiled lazy DFA.
70 //
71 // However, once the number starts getting large, the lazy DFA is likely
72 // to start thrashing because of the modest default cache size. When
73 // exactly does this happen? Dunno. But at whatever point that is (we make
74 // a guess below based on ad hoc benchmarking), we'll want to cut over to
75 // Aho-Corasick, where even the contiguous NFA is likely to do much better.
76if lits.len() < 3000 {
77debug!("skipping Aho-Corasick because there are too few literals");
78return None;
79 }
80Some(lits)
81}