1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
34The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
910Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
1920use core::{char, cmp};
2122use alloc::{
23boxed::Box,
24format,
25 string::{String, ToString},
26vec,
27vec::Vec,
28};
2930use crate::{
31ast::Span,
32 hir::interval::{Interval, IntervalSet, IntervalSetIter},
33unicode,
34};
3536pub use crate::{
37 hir::visitor::{visit, Visitor},
38unicode::CaseFoldError,
39};
4041mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
4647/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(#[automatically_derived]
impl ::core::clone::Clone for Error {
#[inline]
fn clone(&self) -> Error {
Error {
kind: ::core::clone::Clone::clone(&self.kind),
pattern: ::core::clone::Clone::clone(&self.pattern),
span: ::core::clone::Clone::clone(&self.span),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Error {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f, "Error", "kind",
&self.kind, "pattern", &self.pattern, "span", &&self.span)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Error {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<ErrorKind>;
let _: ::core::cmp::AssertParamIsEq<String>;
let _: ::core::cmp::AssertParamIsEq<Span>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Error {
#[inline]
fn eq(&self, other: &Error) -> bool {
self.kind == other.kind && self.pattern == other.pattern &&
self.span == other.span
}
}PartialEq)]
49pub struct Error {
50/// The kind of error.
51kind: ErrorKind,
52/// The original pattern that the translator's Ast was parsed from. Every
53 /// span in an error is a valid range into this string.
54pattern: String,
55/// The span of this error, derived from the Ast given to the translator.
56span: Span,
57}
5859impl Error {
60/// Return the type of this error.
61pub fn kind(&self) -> &ErrorKind {
62&self.kind
63 }
6465/// The original pattern string in which this error occurred.
66 ///
67 /// Every span reported by this error is reported in terms of this string.
68pub fn pattern(&self) -> &str {
69&self.pattern
70 }
7172/// Return the span at which this error occurred.
73pub fn span(&self) -> &Span {
74&self.span
75 }
76}
7778/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(#[automatically_derived]
impl ::core::clone::Clone for ErrorKind {
#[inline]
fn clone(&self) -> ErrorKind {
match self {
ErrorKind::UnicodeNotAllowed => ErrorKind::UnicodeNotAllowed,
ErrorKind::InvalidUtf8 => ErrorKind::InvalidUtf8,
ErrorKind::InvalidLineTerminator =>
ErrorKind::InvalidLineTerminator,
ErrorKind::UnicodePropertyNotFound =>
ErrorKind::UnicodePropertyNotFound,
ErrorKind::UnicodePropertyValueNotFound =>
ErrorKind::UnicodePropertyValueNotFound,
ErrorKind::UnicodePerlClassNotFound =>
ErrorKind::UnicodePerlClassNotFound,
ErrorKind::UnicodeCaseUnavailable =>
ErrorKind::UnicodeCaseUnavailable,
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for ErrorKind {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::write_str(f,
match self {
ErrorKind::UnicodeNotAllowed => "UnicodeNotAllowed",
ErrorKind::InvalidUtf8 => "InvalidUtf8",
ErrorKind::InvalidLineTerminator => "InvalidLineTerminator",
ErrorKind::UnicodePropertyNotFound =>
"UnicodePropertyNotFound",
ErrorKind::UnicodePropertyValueNotFound =>
"UnicodePropertyValueNotFound",
ErrorKind::UnicodePerlClassNotFound =>
"UnicodePerlClassNotFound",
ErrorKind::UnicodeCaseUnavailable => "UnicodeCaseUnavailable",
})
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for ErrorKind {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ErrorKind {
#[inline]
fn eq(&self, other: &ErrorKind) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr
}
}PartialEq)]
84pub enum ErrorKind {
85/// This error occurs when a Unicode feature is used when Unicode
86 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87UnicodeNotAllowed,
88/// This error occurs when translating a pattern that could match a byte
89 /// sequence that isn't UTF-8 and `utf8` was enabled.
90InvalidUtf8,
91/// This error occurs when one uses a non-ASCII byte for a line terminator,
92 /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93InvalidLineTerminator,
94/// This occurs when an unrecognized Unicode property name could not
95 /// be found.
96UnicodePropertyNotFound,
97/// This occurs when an unrecognized Unicode property value could not
98 /// be found.
99UnicodePropertyValueNotFound,
100/// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101 /// `\d`) could not be found. This can occur when the `unicode-perl`
102 /// crate feature is not enabled.
103UnicodePerlClassNotFound,
104/// This occurs when the Unicode simple case mapping tables are not
105 /// available, and the regular expression required Unicode aware case
106 /// insensitivity.
107UnicodeCaseUnavailable,
108}
109110#[cfg(feature = "std")]
111impl std::error::Errorfor Error {}
112113impl core::fmt::Displayfor Error {
114fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115crate::error::Formatter::from(self).fmt(f)
116 }
117}
118119impl core::fmt::Displayfor ErrorKind {
120fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121use self::ErrorKind::*;
122123let msg = match *self {
124UnicodeNotAllowed => "Unicode not allowed here",
125InvalidUtf8 => "pattern can match invalid UTF-8",
126InvalidLineTerminator => "invalid line terminator, must be ASCII",
127UnicodePropertyNotFound => "Unicode property not found",
128UnicodePropertyValueNotFound => "Unicode property value not found",
129UnicodePerlClassNotFound => {
130"Unicode-aware Perl class not found \
131 (make sure the unicode-perl feature is enabled)"
132}
133UnicodeCaseUnavailable => {
134"Unicode-aware case insensitivity matching is not available \
135 (make sure the unicode-case feature is enabled)"
136}
137 };
138f.write_str(msg)
139 }
140}
141142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(#[automatically_derived]
impl ::core::clone::Clone for Hir {
#[inline]
fn clone(&self) -> Hir {
Hir {
kind: ::core::clone::Clone::clone(&self.kind),
props: ::core::clone::Clone::clone(&self.props),
}
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for Hir {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<HirKind>;
let _: ::core::cmp::AssertParamIsEq<Properties>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Hir {
#[inline]
fn eq(&self, other: &Hir) -> bool {
self.kind == other.kind && self.props == other.props
}
}PartialEq)]
205pub struct Hir {
206/// The underlying HIR kind.
207kind: HirKind,
208/// Analysis info about this HIR, computed during construction.
209props: Properties,
210}
211212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214/// Returns a reference to the underlying HIR kind.
215pub fn kind(&self) -> &HirKind {
216&self.kind
217 }
218219/// Consumes ownership of this HIR expression and returns its underlying
220 /// `HirKind`.
221pub fn into_kind(mut self) -> HirKind {
222 core::mem::replace(&mut self.kind, HirKind::Empty)
223 }
224225/// Returns the properties computed for this `Hir`.
226pub fn properties(&self) -> &Properties {
227&self.props
228 }
229230/// Splits this HIR into its constituent parts.
231 ///
232 /// This is useful because `let Hir { kind, props } = hir;` does not work
233 /// because of `Hir`'s custom `Drop` implementation.
234fn into_parts(mut self) -> (HirKind, Properties) {
235 (
236 core::mem::replace(&mut self.kind, HirKind::Empty),
237 core::mem::replace(&mut self.props, Properties::empty()),
238 )
239 }
240}
241242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simultaneously keeping the representation as simple as possible.
253impl Hir {
254/// Returns an empty HIR expression.
255 ///
256 /// An empty HIR expression always matches, including the empty string.
257#[inline]
258pub fn empty() -> Hir {
259let props = Properties::empty();
260Hir { kind: HirKind::Empty, props }
261 }
262263/// Returns an HIR expression that can never match anything. That is,
264 /// the size of the set of strings in the language described by the HIR
265 /// returned is `0`.
266 ///
267 /// This is distinct from [`Hir::empty`] in that the empty string matches
268 /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269 /// language describe described by `Hir::empty` is non-empty.
270 ///
271 /// Note that currently, the HIR returned uses an empty character class to
272 /// indicate that nothing can match. An equivalent expression that cannot
273 /// match is an empty alternation, but all such "fail" expressions are
274 /// normalized (via smart constructors) to empty character classes. This is
275 /// because empty character classes can be spelled in the concrete syntax
276 /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277 /// empty alternations cannot.
278#[inline]
279pub fn fail() -> Hir {
280let class = Class::Bytes(ClassBytes::empty());
281let props = Properties::class(&class);
282// We can't just call Hir::class here because it defers to Hir::fail
283 // in order to canonicalize the Hir value used to represent "cannot
284 // match."
285Hir { kind: HirKind::Class(class), props }
286 }
287288/// Creates a literal HIR expression.
289 ///
290 /// This accepts anything that can be converted into a `Box<[u8]>`.
291 ///
292 /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293 /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294 /// any HIR node) matches valid UTF-8 exclusively can be queried via
295 /// [`Properties::is_utf8`].
296 ///
297 /// # Example
298 ///
299 /// This example shows that concatenations of `Literal` HIR values will
300 /// automatically get flattened and combined together. So for example, even
301 /// if you concat multiple `Literal` values that are themselves not valid
302 /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303 /// how "smart" Hir's smart constructors are.
304 ///
305 /// ```
306 /// use regex_syntax::hir::{Hir, HirKind, Literal};
307 ///
308 /// let literals = vec![
309 /// Hir::literal([0xE2]),
310 /// Hir::literal([0x98]),
311 /// Hir::literal([0x83]),
312 /// ];
313 /// // Each literal, on its own, is invalid UTF-8.
314 /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315 ///
316 /// let concat = Hir::concat(literals);
317 /// // But the concatenation is valid UTF-8!
318 /// assert!(concat.properties().is_utf8());
319 ///
320 /// // And also notice that the literals have been concatenated into a
321 /// // single `Literal`, to the point where there is no explicit `Concat`!
322 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323 /// assert_eq!(&expected, concat.kind());
324 /// ```
325 ///
326 /// # Example: building a literal from a `char`
327 ///
328 /// This example shows how to build a single `Hir` literal from a `char`
329 /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330 /// encode a `char` value:
331 ///
332 /// ```
333 /// use regex_syntax::hir::{Hir, HirKind, Literal};
334 ///
335 /// let ch = '☃';
336 /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337 ///
338 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339 /// assert_eq!(&expected, got.kind());
340 /// ```
341#[inline]
342pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343let bytes = lit.into();
344if bytes.is_empty() {
345return Hir::empty();
346 }
347348let lit = Literal(bytes);
349let props = Properties::literal(&lit);
350Hir { kind: HirKind::Literal(lit), props }
351 }
352353/// Creates a class HIR expression. The class may either be defined over
354 /// ranges of Unicode codepoints or ranges of raw byte values.
355 ///
356 /// Note that an empty class is permitted. An empty class is equivalent to
357 /// `Hir::fail()`.
358#[inline]
359pub fn class(class: Class) -> Hir {
360if class.is_empty() {
361return Hir::fail();
362 } else if let Some(bytes) = class.literal() {
363return Hir::literal(bytes);
364 }
365let props = Properties::class(&class);
366Hir { kind: HirKind::Class(class), props }
367 }
368369/// Creates a look-around assertion HIR expression.
370#[inline]
371pub fn look(look: Look) -> Hir {
372let props = Properties::look(look);
373Hir { kind: HirKind::Look(look), props }
374 }
375376/// Creates a repetition HIR expression.
377#[inline]
378pub fn repetition(mut rep: Repetition) -> Hir {
379// If the sub-expression of a repetition can only match the empty
380 // string, then we force its maximum to be at most 1.
381if rep.sub.properties().maximum_len() == Some(0) {
382rep.min = cmp::min(rep.min, 1);
383rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384 }
385// The regex 'a{0}' is always equivalent to the empty regex. This is
386 // true even when 'a' is an expression that never matches anything
387 // (like '\P{any}').
388 //
389 // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390if rep.min == 0 && rep.max == Some(0) {
391return Hir::empty();
392 } else if rep.min == 1 && rep.max == Some(1) {
393return *rep.sub;
394 }
395let props = Properties::repetition(&rep);
396Hir { kind: HirKind::Repetition(rep), props }
397 }
398399/// Creates a capture HIR expression.
400 ///
401 /// Note that there is no explicit HIR value for a non-capturing group.
402 /// Since a non-capturing group only exists to override precedence in the
403 /// concrete syntax and since an HIR already does its own grouping based on
404 /// what is parsed, there is no need to explicitly represent non-capturing
405 /// groups in the HIR.
406#[inline]
407pub fn capture(capture: Capture) -> Hir {
408let props = Properties::capture(&capture);
409Hir { kind: HirKind::Capture(capture), props }
410 }
411412/// Returns the concatenation of the given expressions.
413 ///
414 /// This attempts to flatten and simplify the concatenation as appropriate.
415 ///
416 /// # Example
417 ///
418 /// This shows a simple example of basic flattening of both concatenations
419 /// and literals.
420 ///
421 /// ```
422 /// use regex_syntax::hir::Hir;
423 ///
424 /// let hir = Hir::concat(vec![
425 /// Hir::concat(vec![
426 /// Hir::literal([b'a']),
427 /// Hir::literal([b'b']),
428 /// Hir::literal([b'c']),
429 /// ]),
430 /// Hir::concat(vec![
431 /// Hir::literal([b'x']),
432 /// Hir::literal([b'y']),
433 /// Hir::literal([b'z']),
434 /// ]),
435 /// ]);
436 /// let expected = Hir::literal("abcxyz".as_bytes());
437 /// assert_eq!(expected, hir);
438 /// ```
439pub fn concat(subs: Vec<Hir>) -> Hir {
440// We rebuild the concatenation by simplifying it. Would be nice to do
441 // it in place, but that seems a little tricky?
442let mut new = ::alloc::vec::Vec::new()vec![];
443// This gobbles up any adjacent literals in a concatenation and smushes
444 // them together. Basically, when we see a literal, we add its bytes
445 // to 'prior_lit', and whenever we see anything else, we first take
446 // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447let mut prior_lit: Option<Vec<u8>> = None;
448for sub in subs {
449let (kind, props) = sub.into_parts();
450match kind {
451 HirKind::Literal(Literal(bytes)) => {
452if let Some(ref mut prior_bytes) = prior_lit {
453 prior_bytes.extend_from_slice(&bytes);
454 } else {
455 prior_lit = Some(bytes.to_vec());
456 }
457 }
458// We also flatten concats that are direct children of another
459 // concat. We only need to do this one level deep since
460 // Hir::concat is the only way to build concatenations, and so
461 // flattening happens inductively.
462HirKind::Concat(subs2) => {
463for sub2 in subs2 {
464let (kind2, props2) = sub2.into_parts();
465match kind2 {
466 HirKind::Literal(Literal(bytes)) => {
467if let Some(ref mut prior_bytes) = prior_lit {
468 prior_bytes.extend_from_slice(&bytes);
469 } else {
470 prior_lit = Some(bytes.to_vec());
471 }
472 }
473 kind2 => {
474if let Some(prior_bytes) = prior_lit.take() {
475 new.push(Hir::literal(prior_bytes));
476 }
477 new.push(Hir { kind: kind2, props: props2 });
478 }
479 }
480 }
481 }
482// We can just skip empty HIRs.
483HirKind::Empty => {}
484 kind => {
485if let Some(prior_bytes) = prior_lit.take() {
486 new.push(Hir::literal(prior_bytes));
487 }
488 new.push(Hir { kind, props });
489 }
490 }
491 }
492if let Some(prior_bytes) = prior_lit.take() {
493new.push(Hir::literal(prior_bytes));
494 }
495if new.is_empty() {
496return Hir::empty();
497 } else if new.len() == 1 {
498return new.pop().unwrap();
499 }
500let props = Properties::concat(&new);
501Hir { kind: HirKind::Concat(new), props }
502 }
503504/// Returns the alternation of the given expressions.
505 ///
506 /// This flattens and simplifies the alternation as appropriate. This may
507 /// include factoring out common prefixes or even rewriting the alternation
508 /// as a character class.
509 ///
510 /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511 /// is not possible for one to write an empty alternation, or even an
512 /// alternation with a single sub-expression, in the concrete syntax of a
513 /// regex.)
514 ///
515 /// # Example
516 ///
517 /// This is a simple example showing how an alternation might get
518 /// simplified.
519 ///
520 /// ```
521 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522 ///
523 /// let hir = Hir::alternation(vec![
524 /// Hir::literal([b'a']),
525 /// Hir::literal([b'b']),
526 /// Hir::literal([b'c']),
527 /// Hir::literal([b'd']),
528 /// Hir::literal([b'e']),
529 /// Hir::literal([b'f']),
530 /// ]);
531 /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532 /// ClassUnicodeRange::new('a', 'f'),
533 /// ])));
534 /// assert_eq!(expected, hir);
535 /// ```
536 ///
537 /// And another example showing how common prefixes might get factored
538 /// out.
539 ///
540 /// ```
541 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542 ///
543 /// let hir = Hir::alternation(vec![
544 /// Hir::concat(vec![
545 /// Hir::literal("abc".as_bytes()),
546 /// Hir::class(Class::Unicode(ClassUnicode::new([
547 /// ClassUnicodeRange::new('A', 'Z'),
548 /// ]))),
549 /// ]),
550 /// Hir::concat(vec![
551 /// Hir::literal("abc".as_bytes()),
552 /// Hir::class(Class::Unicode(ClassUnicode::new([
553 /// ClassUnicodeRange::new('a', 'z'),
554 /// ]))),
555 /// ]),
556 /// ]);
557 /// let expected = Hir::concat(vec![
558 /// Hir::literal("abc".as_bytes()),
559 /// Hir::alternation(vec![
560 /// Hir::class(Class::Unicode(ClassUnicode::new([
561 /// ClassUnicodeRange::new('A', 'Z'),
562 /// ]))),
563 /// Hir::class(Class::Unicode(ClassUnicode::new([
564 /// ClassUnicodeRange::new('a', 'z'),
565 /// ]))),
566 /// ]),
567 /// ]);
568 /// assert_eq!(expected, hir);
569 /// ```
570 ///
571 /// Note that these sorts of simplifications are not guaranteed.
572pub fn alternation(subs: Vec<Hir>) -> Hir {
573// We rebuild the alternation by simplifying it. We proceed similarly
574 // as the concatenation case. But in this case, there's no literal
575 // simplification happening. We're just flattening alternations.
576let mut new = Vec::with_capacity(subs.len());
577for sub in subs {
578let (kind, props) = sub.into_parts();
579match kind {
580 HirKind::Alternation(subs2) => {
581 new.extend(subs2);
582 }
583 kind => {
584 new.push(Hir { kind, props });
585 }
586 }
587 }
588if new.is_empty() {
589return Hir::fail();
590 } else if new.len() == 1 {
591return new.pop().unwrap();
592 }
593// Now that it's completely flattened, look for the special case of
594 // 'char1|char2|...|charN' and collapse that into a class. Note that
595 // we look for 'char' first and then bytes. The issue here is that if
596 // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597 // then it isn't actually possible to smush them into a single class.
598 // (Because classes are either "all codepoints" or "all bytes." You
599 // can have a class that both matches non-ASCII but valid UTF-8 and
600 // invalid UTF-8.) So we look for all chars and then all bytes, and
601 // don't handle anything else.
602if let Some(singletons) = singleton_chars(&new) {
603let it = singletons604 .into_iter()
605 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607 }
608if let Some(singletons) = singleton_bytes(&new) {
609let it = singletons610 .into_iter()
611 .map(|b| ClassBytesRange { start: b, end: b });
612return Hir::class(Class::Bytes(ClassBytes::new(it)));
613 }
614// Similar to singleton chars, we can also look for alternations of
615 // classes. Those can be smushed into a single class.
616if let Some(cls) = class_chars(&new) {
617return Hir::class(cls);
618 }
619if let Some(cls) = class_bytes(&new) {
620return Hir::class(cls);
621 }
622// Factor out a common prefix if we can, which might potentially
623 // simplify the expression and unlock other optimizations downstream.
624 // It also might generally make NFA matching and DFA construction
625 // faster by reducing the scope of branching in the regex.
626new = match lift_common_prefix(new) {
627Ok(hir) => return hir,
628Err(unchanged) => unchanged,
629 };
630let props = Properties::alternation(&new);
631Hir { kind: HirKind::Alternation(new), props }
632 }
633634/// Returns an HIR expression for `.`.
635 ///
636 /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637 /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638 /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639 /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640 /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641 /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642 ///
643 /// # Example
644 ///
645 /// Note that this is a convenience routine for constructing the correct
646 /// character class based on the value of `Dot`. There is no explicit "dot"
647 /// HIR value. It is just an abbreviation for a common character class.
648 ///
649 /// ```
650 /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651 ///
652 /// let hir = Hir::dot(Dot::AnyByte);
653 /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654 /// ClassBytesRange::new(0x00, 0xFF),
655 /// ])));
656 /// assert_eq!(expected, hir);
657 /// ```
658#[inline]
659pub fn dot(dot: Dot) -> Hir {
660match dot {
661 Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([
662ClassUnicodeRange::new('\0', '\u{10FFFF}'),
663 ]))),
664 Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([
665ClassBytesRange::new(b'\0', b'\xFF'),
666 ]))),
667 Dot::AnyCharExcept(ch) => {
668let mut cls =
669ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
670cls.negate();
671Hir::class(Class::Unicode(cls))
672 }
673 Dot::AnyCharExceptLF => {
674Hir::class(Class::Unicode(ClassUnicode::new([
675ClassUnicodeRange::new('\0', '\x09'),
676ClassUnicodeRange::new('\x0B', '\u{10FFFF}'),
677 ])))
678 }
679 Dot::AnyCharExceptCRLF => {
680Hir::class(Class::Unicode(ClassUnicode::new([
681ClassUnicodeRange::new('\0', '\x09'),
682ClassUnicodeRange::new('\x0B', '\x0C'),
683ClassUnicodeRange::new('\x0E', '\u{10FFFF}'),
684 ])))
685 }
686 Dot::AnyByteExcept(byte) => {
687let mut cls =
688ClassBytes::new([ClassBytesRange::new(byte, byte)]);
689cls.negate();
690Hir::class(Class::Bytes(cls))
691 }
692 Dot::AnyByteExceptLF => {
693Hir::class(Class::Bytes(ClassBytes::new([
694ClassBytesRange::new(b'\0', b'\x09'),
695ClassBytesRange::new(b'\x0B', b'\xFF'),
696 ])))
697 }
698 Dot::AnyByteExceptCRLF => {
699Hir::class(Class::Bytes(ClassBytes::new([
700ClassBytesRange::new(b'\0', b'\x09'),
701ClassBytesRange::new(b'\x0B', b'\x0C'),
702ClassBytesRange::new(b'\x0E', b'\xFF'),
703 ])))
704 }
705 }
706 }
707}
708709/// The underlying kind of an arbitrary [`Hir`] expression.
710///
711/// An `HirKind` is principally useful for doing case analysis on the type
712/// of a regular expression. If you're looking to build new `Hir` values,
713/// then you _must_ use the smart constructors defined on `Hir`, like
714/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
715/// not expose any way of building an `Hir` directly from an `HirKind`.
716#[derive(#[automatically_derived]
impl ::core::clone::Clone for HirKind {
#[inline]
fn clone(&self) -> HirKind {
match self {
HirKind::Empty => HirKind::Empty,
HirKind::Literal(__self_0) =>
HirKind::Literal(::core::clone::Clone::clone(__self_0)),
HirKind::Class(__self_0) =>
HirKind::Class(::core::clone::Clone::clone(__self_0)),
HirKind::Look(__self_0) =>
HirKind::Look(::core::clone::Clone::clone(__self_0)),
HirKind::Repetition(__self_0) =>
HirKind::Repetition(::core::clone::Clone::clone(__self_0)),
HirKind::Capture(__self_0) =>
HirKind::Capture(::core::clone::Clone::clone(__self_0)),
HirKind::Concat(__self_0) =>
HirKind::Concat(::core::clone::Clone::clone(__self_0)),
HirKind::Alternation(__self_0) =>
HirKind::Alternation(::core::clone::Clone::clone(__self_0)),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for HirKind {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
match self {
HirKind::Empty => ::core::fmt::Formatter::write_str(f, "Empty"),
HirKind::Literal(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"Literal", &__self_0),
HirKind::Class(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f, "Class",
&__self_0),
HirKind::Look(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f, "Look",
&__self_0),
HirKind::Repetition(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"Repetition", &__self_0),
HirKind::Capture(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"Capture", &__self_0),
HirKind::Concat(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f, "Concat",
&__self_0),
HirKind::Alternation(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"Alternation", &__self_0),
}
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for HirKind {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Literal>;
let _: ::core::cmp::AssertParamIsEq<Class>;
let _: ::core::cmp::AssertParamIsEq<Look>;
let _: ::core::cmp::AssertParamIsEq<Repetition>;
let _: ::core::cmp::AssertParamIsEq<Capture>;
let _: ::core::cmp::AssertParamIsEq<Vec<Hir>>;
let _: ::core::cmp::AssertParamIsEq<Vec<Hir>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for HirKind {
#[inline]
fn eq(&self, other: &HirKind) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr &&
match (self, other) {
(HirKind::Literal(__self_0), HirKind::Literal(__arg1_0)) =>
__self_0 == __arg1_0,
(HirKind::Class(__self_0), HirKind::Class(__arg1_0)) =>
__self_0 == __arg1_0,
(HirKind::Look(__self_0), HirKind::Look(__arg1_0)) =>
__self_0 == __arg1_0,
(HirKind::Repetition(__self_0), HirKind::Repetition(__arg1_0))
=> __self_0 == __arg1_0,
(HirKind::Capture(__self_0), HirKind::Capture(__arg1_0)) =>
__self_0 == __arg1_0,
(HirKind::Concat(__self_0), HirKind::Concat(__arg1_0)) =>
__self_0 == __arg1_0,
(HirKind::Alternation(__self_0),
HirKind::Alternation(__arg1_0)) => __self_0 == __arg1_0,
_ => true,
}
}
}PartialEq)]
717pub enum HirKind {
718/// The empty regular expression, which matches everything, including the
719 /// empty string.
720Empty,
721/// A literal string that matches exactly these bytes.
722Literal(Literal),
723/// A single character class that matches any of the characters in the
724 /// class. A class can either consist of Unicode scalar values as
725 /// characters, or it can use bytes.
726 ///
727 /// A class may be empty. In which case, it matches nothing.
728Class(Class),
729/// A look-around assertion. A look-around match always has zero length.
730Look(Look),
731/// A repetition operation applied to a sub-expression.
732Repetition(Repetition),
733/// A capturing group, which contains a sub-expression.
734Capture(Capture),
735/// A concatenation of expressions.
736 ///
737 /// A concatenation matches only if each of its sub-expressions match one
738 /// after the other.
739 ///
740 /// Concatenations are guaranteed by `Hir`'s smart constructors to always
741 /// have at least two sub-expressions.
742Concat(Vec<Hir>),
743/// An alternation of expressions.
744 ///
745 /// An alternation matches only if at least one of its sub-expressions
746 /// match. If multiple sub-expressions match, then the leftmost is
747 /// preferred.
748 ///
749 /// Alternations are guaranteed by `Hir`'s smart constructors to always
750 /// have at least two sub-expressions.
751Alternation(Vec<Hir>),
752}
753754impl HirKind {
755/// Returns a slice of this kind's sub-expressions, if any.
756pub fn subs(&self) -> &[Hir] {
757use core::slice::from_ref;
758759match *self {
760 HirKind::Empty761 | HirKind::Literal(_)
762 | HirKind::Class(_)
763 | HirKind::Look(_) => &[],
764 HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
765 HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
766 HirKind::Concat(ref subs) => subs,
767 HirKind::Alternation(ref subs) => subs,
768 }
769 }
770}
771772impl core::fmt::Debugfor Hir {
773fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774self.kind.fmt(f)
775 }
776}
777778/// Print a display representation of this Hir.
779///
780/// The result of this is a valid regular expression pattern string.
781///
782/// This implementation uses constant stack space and heap space proportional
783/// to the size of the `Hir`.
784impl core::fmt::Displayfor Hir {
785fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
786crate::hir::print::Printer::new().print(self, f)
787 }
788}
789790/// The high-level intermediate representation of a literal.
791///
792/// A literal corresponds to `0` or more bytes that should be matched
793/// literally. The smart constructors defined on `Hir` will automatically
794/// concatenate adjacent literals into one literal, and will even automatically
795/// replace empty literals with `Hir::empty()`.
796///
797/// Note that despite a literal being represented by a sequence of bytes, its
798/// `Debug` implementation will attempt to print it as a normal string. (That
799/// is, not a sequence of decimal numbers.)
800#[derive(#[automatically_derived]
impl ::core::clone::Clone for Literal {
#[inline]
fn clone(&self) -> Literal {
Literal(::core::clone::Clone::clone(&self.0))
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for Literal {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Box<[u8]>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Literal {
#[inline]
fn eq(&self, other: &Literal) -> bool { self.0 == other.0 }
}PartialEq)]
801pub struct Literal(pub Box<[u8]>);
802803impl core::fmt::Debugfor Literal {
804fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
805crate::debug::Bytes(&self.0).fmt(f)
806 }
807}
808809/// The high-level intermediate representation of a character class.
810///
811/// A character class corresponds to a set of characters. A character is either
812/// defined by a Unicode scalar value or a byte.
813///
814/// A character class, regardless of its character type, is represented by a
815/// sequence of non-overlapping non-adjacent ranges of characters.
816///
817/// There are no guarantees about which class variant is used. Generally
818/// speaking, the Unicode variant is used whenever a class needs to contain
819/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
820/// when Unicode mode is disabled. For example, at the time of writing, the
821/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
822/// `[a\u00A0]` due to optimizations.
823///
824/// Note that `Bytes` variant may be produced even when it exclusively matches
825/// valid UTF-8. This is because a `Bytes` variant represents an intention by
826/// the author of the regular expression to disable Unicode mode, which in turn
827/// impacts the semantics of case insensitive matching. For example, `(?i)k`
828/// and `(?i-u)k` will not match the same set of strings.
829#[derive(#[automatically_derived]
impl ::core::clone::Clone for Class {
#[inline]
fn clone(&self) -> Class {
match self {
Class::Unicode(__self_0) =>
Class::Unicode(::core::clone::Clone::clone(__self_0)),
Class::Bytes(__self_0) =>
Class::Bytes(::core::clone::Clone::clone(__self_0)),
}
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for Class {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<ClassUnicode>;
let _: ::core::cmp::AssertParamIsEq<ClassBytes>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Class {
#[inline]
fn eq(&self, other: &Class) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr &&
match (self, other) {
(Class::Unicode(__self_0), Class::Unicode(__arg1_0)) =>
__self_0 == __arg1_0,
(Class::Bytes(__self_0), Class::Bytes(__arg1_0)) =>
__self_0 == __arg1_0,
_ => unsafe { ::core::intrinsics::unreachable() }
}
}
}PartialEq)]
830pub enum Class {
831/// A set of characters represented by Unicode scalar values.
832Unicode(ClassUnicode),
833/// A set of characters represented by arbitrary bytes (one byte per
834 /// character).
835Bytes(ClassBytes),
836}
837838impl Class {
839/// Apply Unicode simple case folding to this character class, in place.
840 /// The character class will be expanded to include all simple case folded
841 /// character variants.
842 ///
843 /// If this is a byte oriented character class, then this will be limited
844 /// to the ASCII ranges `A-Z` and `a-z`.
845 ///
846 /// # Panics
847 ///
848 /// This routine panics when the case mapping data necessary for this
849 /// routine to complete is unavailable. This occurs when the `unicode-case`
850 /// feature is not enabled and the underlying class is Unicode oriented.
851 ///
852 /// Callers should prefer using `try_case_fold_simple` instead, which will
853 /// return an error instead of panicking.
854pub fn case_fold_simple(&mut self) {
855match *self {
856 Class::Unicode(ref mut x) => x.case_fold_simple(),
857 Class::Bytes(ref mut x) => x.case_fold_simple(),
858 }
859 }
860861/// Apply Unicode simple case folding to this character class, in place.
862 /// The character class will be expanded to include all simple case folded
863 /// character variants.
864 ///
865 /// If this is a byte oriented character class, then this will be limited
866 /// to the ASCII ranges `A-Z` and `a-z`.
867 ///
868 /// # Error
869 ///
870 /// This routine returns an error when the case mapping data necessary
871 /// for this routine to complete is unavailable. This occurs when the
872 /// `unicode-case` feature is not enabled and the underlying class is
873 /// Unicode oriented.
874pub fn try_case_fold_simple(
875&mut self,
876 ) -> core::result::Result<(), CaseFoldError> {
877match *self {
878 Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
879 Class::Bytes(ref mut x) => x.case_fold_simple(),
880 }
881Ok(())
882 }
883884/// Negate this character class in place.
885 ///
886 /// After completion, this character class will contain precisely the
887 /// characters that weren't previously in the class.
888pub fn negate(&mut self) {
889match *self {
890 Class::Unicode(ref mut x) => x.negate(),
891 Class::Bytes(ref mut x) => x.negate(),
892 }
893 }
894895/// Returns true if and only if this character class will only ever match
896 /// valid UTF-8.
897 ///
898 /// A character class can match invalid UTF-8 only when the following
899 /// conditions are met:
900 ///
901 /// 1. The translator was configured to permit generating an expression
902 /// that can match invalid UTF-8. (By default, this is disabled.)
903 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
904 /// syntax or in the parser builder. By default, Unicode mode is
905 /// enabled.
906pub fn is_utf8(&self) -> bool {
907match *self {
908 Class::Unicode(_) => true,
909 Class::Bytes(ref x) => x.is_ascii(),
910 }
911 }
912913/// Returns the length, in bytes, of the smallest string matched by this
914 /// character class.
915 ///
916 /// For non-empty byte oriented classes, this always returns `1`. For
917 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
918 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
919 /// be returned.
920 ///
921 /// # Example
922 ///
923 /// This example shows some examples of regexes and their corresponding
924 /// minimum length, if any.
925 ///
926 /// ```
927 /// use regex_syntax::{hir::Properties, parse};
928 ///
929 /// // The empty string has a min length of 0.
930 /// let hir = parse(r"")?;
931 /// assert_eq!(Some(0), hir.properties().minimum_len());
932 /// // As do other types of regexes that only match the empty string.
933 /// let hir = parse(r"^$\b\B")?;
934 /// assert_eq!(Some(0), hir.properties().minimum_len());
935 /// // A regex that can match the empty string but match more is still 0.
936 /// let hir = parse(r"a*")?;
937 /// assert_eq!(Some(0), hir.properties().minimum_len());
938 /// // A regex that matches nothing has no minimum defined.
939 /// let hir = parse(r"[a&&b]")?;
940 /// assert_eq!(None, hir.properties().minimum_len());
941 /// // Character classes usually have a minimum length of 1.
942 /// let hir = parse(r"\w")?;
943 /// assert_eq!(Some(1), hir.properties().minimum_len());
944 /// // But sometimes Unicode classes might be bigger!
945 /// let hir = parse(r"\p{Cyrillic}")?;
946 /// assert_eq!(Some(2), hir.properties().minimum_len());
947 ///
948 /// # Ok::<(), Box<dyn std::error::Error>>(())
949 /// ```
950pub fn minimum_len(&self) -> Option<usize> {
951match *self {
952 Class::Unicode(ref x) => x.minimum_len(),
953 Class::Bytes(ref x) => x.minimum_len(),
954 }
955 }
956957/// Returns the length, in bytes, of the longest string matched by this
958 /// character class.
959 ///
960 /// For non-empty byte oriented classes, this always returns `1`. For
961 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
962 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
963 /// be returned.
964 ///
965 /// # Example
966 ///
967 /// This example shows some examples of regexes and their corresponding
968 /// maximum length, if any.
969 ///
970 /// ```
971 /// use regex_syntax::{hir::Properties, parse};
972 ///
973 /// // The empty string has a max length of 0.
974 /// let hir = parse(r"")?;
975 /// assert_eq!(Some(0), hir.properties().maximum_len());
976 /// // As do other types of regexes that only match the empty string.
977 /// let hir = parse(r"^$\b\B")?;
978 /// assert_eq!(Some(0), hir.properties().maximum_len());
979 /// // A regex that matches nothing has no maximum defined.
980 /// let hir = parse(r"[a&&b]")?;
981 /// assert_eq!(None, hir.properties().maximum_len());
982 /// // Bounded repeats work as you expect.
983 /// let hir = parse(r"x{2,10}")?;
984 /// assert_eq!(Some(10), hir.properties().maximum_len());
985 /// // An unbounded repeat means there is no maximum.
986 /// let hir = parse(r"x{2,}")?;
987 /// assert_eq!(None, hir.properties().maximum_len());
988 /// // With Unicode enabled, \w can match up to 4 bytes!
989 /// let hir = parse(r"\w")?;
990 /// assert_eq!(Some(4), hir.properties().maximum_len());
991 /// // Without Unicode enabled, \w matches at most 1 byte.
992 /// let hir = parse(r"(?-u)\w")?;
993 /// assert_eq!(Some(1), hir.properties().maximum_len());
994 ///
995 /// # Ok::<(), Box<dyn std::error::Error>>(())
996 /// ```
997pub fn maximum_len(&self) -> Option<usize> {
998match *self {
999 Class::Unicode(ref x) => x.maximum_len(),
1000 Class::Bytes(ref x) => x.maximum_len(),
1001 }
1002 }
10031004/// Returns true if and only if this character class is empty. That is,
1005 /// it has no elements.
1006 ///
1007 /// An empty character can never match anything, including an empty string.
1008pub fn is_empty(&self) -> bool {
1009match *self {
1010 Class::Unicode(ref x) => x.ranges().is_empty(),
1011 Class::Bytes(ref x) => x.ranges().is_empty(),
1012 }
1013 }
10141015/// If this class consists of exactly one element (whether a codepoint or a
1016 /// byte), then return it as a literal byte string.
1017 ///
1018 /// If this class is empty or contains more than one element, then `None`
1019 /// is returned.
1020pub fn literal(&self) -> Option<Vec<u8>> {
1021match *self {
1022 Class::Unicode(ref x) => x.literal(),
1023 Class::Bytes(ref x) => x.literal(),
1024 }
1025 }
1026}
10271028impl core::fmt::Debugfor Class {
1029fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1030use crate::debug::Byte;
10311032let mut fmter = f.debug_set();
1033match *self {
1034 Class::Unicode(ref cls) => {
1035for r in cls.ranges().iter() {
1036 fmter.entry(&(r.start..=r.end));
1037 }
1038 }
1039 Class::Bytes(ref cls) => {
1040for r in cls.ranges().iter() {
1041 fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1042 }
1043 }
1044 }
1045fmter.finish()
1046 }
1047}
10481049/// A set of characters represented by Unicode scalar values.
1050#[derive(#[automatically_derived]
impl ::core::clone::Clone for ClassUnicode {
#[inline]
fn clone(&self) -> ClassUnicode {
ClassUnicode { set: ::core::clone::Clone::clone(&self.set) }
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for ClassUnicode {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "ClassUnicode",
"set", &&self.set)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for ClassUnicode {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<IntervalSet<ClassUnicodeRange>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ClassUnicode {
#[inline]
fn eq(&self, other: &ClassUnicode) -> bool { self.set == other.set }
}PartialEq)]
1051pub struct ClassUnicode {
1052 set: IntervalSet<ClassUnicodeRange>,
1053}
10541055impl ClassUnicode {
1056/// Create a new class from a sequence of ranges.
1057 ///
1058 /// The given ranges do not need to be in any specific order, and ranges
1059 /// may overlap. Ranges will automatically be sorted into a canonical
1060 /// non-overlapping order.
1061pub fn new<I>(ranges: I) -> ClassUnicode1062where
1063I: IntoIterator<Item = ClassUnicodeRange>,
1064 {
1065ClassUnicode { set: IntervalSet::new(ranges) }
1066 }
10671068/// Create a new class with no ranges.
1069 ///
1070 /// An empty class matches nothing. That is, it is equivalent to
1071 /// [`Hir::fail`].
1072pub fn empty() -> ClassUnicode {
1073ClassUnicode::new(::alloc::vec::Vec::new()vec![])
1074 }
10751076/// Add a new range to this set.
1077pub fn push(&mut self, range: ClassUnicodeRange) {
1078self.set.push(range);
1079 }
10801081/// Return an iterator over all ranges in this class.
1082 ///
1083 /// The iterator yields ranges in ascending order.
1084pub fn iter(&self) -> ClassUnicodeIter<'_> {
1085ClassUnicodeIter(self.set.iter())
1086 }
10871088/// Return the underlying ranges as a slice.
1089pub fn ranges(&self) -> &[ClassUnicodeRange] {
1090self.set.intervals()
1091 }
10921093/// Expand this character class such that it contains all case folded
1094 /// characters, according to Unicode's "simple" mapping. For example, if
1095 /// this class consists of the range `a-z`, then applying case folding will
1096 /// result in the class containing both the ranges `a-z` and `A-Z`.
1097 ///
1098 /// # Panics
1099 ///
1100 /// This routine panics when the case mapping data necessary for this
1101 /// routine to complete is unavailable. This occurs when the `unicode-case`
1102 /// feature is not enabled.
1103 ///
1104 /// Callers should prefer using `try_case_fold_simple` instead, which will
1105 /// return an error instead of panicking.
1106pub fn case_fold_simple(&mut self) {
1107self.set
1108 .case_fold_simple()
1109 .expect("unicode-case feature must be enabled");
1110 }
11111112/// Expand this character class such that it contains all case folded
1113 /// characters, according to Unicode's "simple" mapping. For example, if
1114 /// this class consists of the range `a-z`, then applying case folding will
1115 /// result in the class containing both the ranges `a-z` and `A-Z`.
1116 ///
1117 /// # Error
1118 ///
1119 /// This routine returns an error when the case mapping data necessary
1120 /// for this routine to complete is unavailable. This occurs when the
1121 /// `unicode-case` feature is not enabled.
1122pub fn try_case_fold_simple(
1123&mut self,
1124 ) -> core::result::Result<(), CaseFoldError> {
1125self.set.case_fold_simple()
1126 }
11271128/// Negate this character class.
1129 ///
1130 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1131 /// set, then it will not be in this set after negation.
1132pub fn negate(&mut self) {
1133self.set.negate();
1134 }
11351136/// Union this character class with the given character class, in place.
1137pub fn union(&mut self, other: &ClassUnicode) {
1138self.set.union(&other.set);
1139 }
11401141/// Intersect this character class with the given character class, in
1142 /// place.
1143pub fn intersect(&mut self, other: &ClassUnicode) {
1144self.set.intersect(&other.set);
1145 }
11461147/// Subtract the given character class from this character class, in place.
1148pub fn difference(&mut self, other: &ClassUnicode) {
1149self.set.difference(&other.set);
1150 }
11511152/// Compute the symmetric difference of the given character classes, in
1153 /// place.
1154 ///
1155 /// This computes the symmetric difference of two character classes. This
1156 /// removes all elements in this class that are also in the given class,
1157 /// but all adds all elements from the given class that aren't in this
1158 /// class. That is, the class will contain all elements in either class,
1159 /// but will not contain any elements that are in both classes.
1160pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1161self.set.symmetric_difference(&other.set);
1162 }
11631164/// Returns true if and only if this character class will either match
1165 /// nothing or only ASCII bytes. Stated differently, this returns false
1166 /// if and only if this class contains a non-ASCII codepoint.
1167pub fn is_ascii(&self) -> bool {
1168self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1169 }
11701171/// Returns the length, in bytes, of the smallest string matched by this
1172 /// character class.
1173 ///
1174 /// Returns `None` when the class is empty.
1175pub fn minimum_len(&self) -> Option<usize> {
1176let first = self.ranges().get(0)?;
1177// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1178Some(first.start.len_utf8())
1179 }
11801181/// Returns the length, in bytes, of the longest string matched by this
1182 /// character class.
1183 ///
1184 /// Returns `None` when the class is empty.
1185pub fn maximum_len(&self) -> Option<usize> {
1186let last = self.ranges().last()?;
1187// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1188Some(last.end.len_utf8())
1189 }
11901191/// If this class consists of exactly one codepoint, then return it as
1192 /// a literal byte string.
1193 ///
1194 /// If this class is empty or contains more than one codepoint, then `None`
1195 /// is returned.
1196pub fn literal(&self) -> Option<Vec<u8>> {
1197let rs = self.ranges();
1198if rs.len() == 1 && rs[0].start == rs[0].end {
1199Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1200 } else {
1201None1202 }
1203 }
12041205/// If this class consists of only ASCII ranges, then return its
1206 /// corresponding and equivalent byte class.
1207pub fn to_byte_class(&self) -> Option<ClassBytes> {
1208if !self.is_ascii() {
1209return None;
1210 }
1211Some(ClassBytes::new(self.ranges().iter().map(|r| {
1212// Since we are guaranteed that our codepoint range is ASCII, the
1213 // 'u8::try_from' calls below are guaranteed to be correct.
1214ClassBytesRange {
1215 start: u8::try_from(r.start).unwrap(),
1216 end: u8::try_from(r.end).unwrap(),
1217 }
1218 })))
1219 }
1220}
12211222/// An iterator over all ranges in a Unicode character class.
1223///
1224/// The lifetime `'a` refers to the lifetime of the underlying class.
1225#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ClassUnicodeIter<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"ClassUnicodeIter", &&self.0)
}
}Debug)]
1226pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
12271228impl<'a> Iteratorfor ClassUnicodeIter<'a> {
1229type Item = &'a ClassUnicodeRange;
12301231fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1232self.0.next()
1233 }
1234}
12351236/// A single range of characters represented by Unicode scalar values.
1237///
1238/// The range is closed. That is, the start and end of the range are included
1239/// in the range.
1240#[derive(#[automatically_derived]
impl ::core::clone::Clone for ClassUnicodeRange {
#[inline]
fn clone(&self) -> ClassUnicodeRange {
let _: ::core::clone::AssertParamIsClone<char>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ClassUnicodeRange { }Copy, #[automatically_derived]
impl ::core::default::Default for ClassUnicodeRange {
#[inline]
fn default() -> ClassUnicodeRange {
ClassUnicodeRange {
start: ::core::default::Default::default(),
end: ::core::default::Default::default(),
}
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for ClassUnicodeRange {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<char>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ClassUnicodeRange {
#[inline]
fn eq(&self, other: &ClassUnicodeRange) -> bool {
self.start == other.start && self.end == other.end
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for ClassUnicodeRange {
#[inline]
fn partial_cmp(&self, other: &ClassUnicodeRange)
-> ::core::option::Option<::core::cmp::Ordering> {
match ::core::cmp::PartialOrd::partial_cmp(&self.start, &other.start)
{
::core::option::Option::Some(::core::cmp::Ordering::Equal) =>
::core::cmp::PartialOrd::partial_cmp(&self.end, &other.end),
cmp => cmp,
}
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for ClassUnicodeRange {
#[inline]
fn cmp(&self, other: &ClassUnicodeRange) -> ::core::cmp::Ordering {
match ::core::cmp::Ord::cmp(&self.start, &other.start) {
::core::cmp::Ordering::Equal =>
::core::cmp::Ord::cmp(&self.end, &other.end),
cmp => cmp,
}
}
}Ord)]
1241pub struct ClassUnicodeRange {
1242 start: char,
1243 end: char,
1244}
12451246impl core::fmt::Debugfor ClassUnicodeRange {
1247fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1248let start = if !self.start.is_whitespace() && !self.start.is_control()
1249 {
1250self.start.to_string()
1251 } else {
1252::alloc::__export::must_use({
::alloc::fmt::format(format_args!("0x{0:X}", u32::from(self.start)))
})format!("0x{:X}", u32::from(self.start))1253 };
1254let end = if !self.end.is_whitespace() && !self.end.is_control() {
1255self.end.to_string()
1256 } else {
1257::alloc::__export::must_use({
::alloc::fmt::format(format_args!("0x{0:X}", u32::from(self.end)))
})format!("0x{:X}", u32::from(self.end))1258 };
1259f.debug_struct("ClassUnicodeRange")
1260 .field("start", &start)
1261 .field("end", &end)
1262 .finish()
1263 }
1264}
12651266impl Intervalfor ClassUnicodeRange {
1267type Bound = char;
12681269#[inline]
1270fn lower(&self) -> char {
1271self.start
1272 }
1273#[inline]
1274fn upper(&self) -> char {
1275self.end
1276 }
1277#[inline]
1278fn set_lower(&mut self, bound: char) {
1279self.start = bound;
1280 }
1281#[inline]
1282fn set_upper(&mut self, bound: char) {
1283self.end = bound;
1284 }
12851286/// Apply simple case folding to this Unicode scalar value range.
1287 ///
1288 /// Additional ranges are appended to the given vector. Canonical ordering
1289 /// is *not* maintained in the given vector.
1290fn case_fold_simple(
1291&self,
1292 ranges: &mut Vec<ClassUnicodeRange>,
1293 ) -> Result<(), unicode::CaseFoldError> {
1294let mut folder = unicode::SimpleCaseFolder::new()?;
1295if !folder.overlaps(self.start, self.end) {
1296return Ok(());
1297 }
1298let (start, end) = (u32::from(self.start), u32::from(self.end));
1299for cp in (start..=end).filter_map(char::from_u32) {
1300for &cp_folded in folder.mapping(cp) {
1301 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1302 }
1303 }
1304Ok(())
1305 }
1306}
13071308impl ClassUnicodeRange {
1309/// Create a new Unicode scalar value range for a character class.
1310 ///
1311 /// The returned range is always in a canonical form. That is, the range
1312 /// returned always satisfies the invariant that `start <= end`.
1313pub fn new(start: char, end: char) -> ClassUnicodeRange {
1314ClassUnicodeRange::create(start, end)
1315 }
13161317/// Return the start of this range.
1318 ///
1319 /// The start of a range is always less than or equal to the end of the
1320 /// range.
1321pub fn start(&self) -> char {
1322self.start
1323 }
13241325/// Return the end of this range.
1326 ///
1327 /// The end of a range is always greater than or equal to the start of the
1328 /// range.
1329pub fn end(&self) -> char {
1330self.end
1331 }
13321333/// Returns the number of codepoints in this range.
1334pub fn len(&self) -> usize {
1335let diff = 1 + u32::from(self.end) - u32::from(self.start);
1336// This is likely to panic in 16-bit targets since a usize can only fit
1337 // 2^16. It's not clear what to do here, other than to return an error
1338 // when building a Unicode class that contains a range whose length
1339 // overflows usize. (Which, to be honest, is probably quite common on
1340 // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1341 // would be impossible to build.)
1342usize::try_from(diff).expect("char class len fits in usize")
1343 }
1344}
13451346/// A set of characters represented by arbitrary bytes.
1347///
1348/// Each byte corresponds to one character.
1349#[derive(#[automatically_derived]
impl ::core::clone::Clone for ClassBytes {
#[inline]
fn clone(&self) -> ClassBytes {
ClassBytes { set: ::core::clone::Clone::clone(&self.set) }
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for ClassBytes {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "ClassBytes",
"set", &&self.set)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for ClassBytes {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<IntervalSet<ClassBytesRange>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ClassBytes {
#[inline]
fn eq(&self, other: &ClassBytes) -> bool { self.set == other.set }
}PartialEq)]
1350pub struct ClassBytes {
1351 set: IntervalSet<ClassBytesRange>,
1352}
13531354impl ClassBytes {
1355/// Create a new class from a sequence of ranges.
1356 ///
1357 /// The given ranges do not need to be in any specific order, and ranges
1358 /// may overlap. Ranges will automatically be sorted into a canonical
1359 /// non-overlapping order.
1360pub fn new<I>(ranges: I) -> ClassBytes1361where
1362I: IntoIterator<Item = ClassBytesRange>,
1363 {
1364ClassBytes { set: IntervalSet::new(ranges) }
1365 }
13661367/// Create a new class with no ranges.
1368 ///
1369 /// An empty class matches nothing. That is, it is equivalent to
1370 /// [`Hir::fail`].
1371pub fn empty() -> ClassBytes {
1372ClassBytes::new(::alloc::vec::Vec::new()vec![])
1373 }
13741375/// Add a new range to this set.
1376pub fn push(&mut self, range: ClassBytesRange) {
1377self.set.push(range);
1378 }
13791380/// Return an iterator over all ranges in this class.
1381 ///
1382 /// The iterator yields ranges in ascending order.
1383pub fn iter(&self) -> ClassBytesIter<'_> {
1384ClassBytesIter(self.set.iter())
1385 }
13861387/// Return the underlying ranges as a slice.
1388pub fn ranges(&self) -> &[ClassBytesRange] {
1389self.set.intervals()
1390 }
13911392/// Expand this character class such that it contains all case folded
1393 /// characters. For example, if this class consists of the range `a-z`,
1394 /// then applying case folding will result in the class containing both the
1395 /// ranges `a-z` and `A-Z`.
1396 ///
1397 /// Note that this only applies ASCII case folding, which is limited to the
1398 /// characters `a-z` and `A-Z`.
1399pub fn case_fold_simple(&mut self) {
1400self.set.case_fold_simple().expect("ASCII case folding never fails");
1401 }
14021403/// Negate this byte class.
1404 ///
1405 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1406 /// will not be in this set after negation.
1407pub fn negate(&mut self) {
1408self.set.negate();
1409 }
14101411/// Union this byte class with the given byte class, in place.
1412pub fn union(&mut self, other: &ClassBytes) {
1413self.set.union(&other.set);
1414 }
14151416/// Intersect this byte class with the given byte class, in place.
1417pub fn intersect(&mut self, other: &ClassBytes) {
1418self.set.intersect(&other.set);
1419 }
14201421/// Subtract the given byte class from this byte class, in place.
1422pub fn difference(&mut self, other: &ClassBytes) {
1423self.set.difference(&other.set);
1424 }
14251426/// Compute the symmetric difference of the given byte classes, in place.
1427 ///
1428 /// This computes the symmetric difference of two byte classes. This
1429 /// removes all elements in this class that are also in the given class,
1430 /// but all adds all elements from the given class that aren't in this
1431 /// class. That is, the class will contain all elements in either class,
1432 /// but will not contain any elements that are in both classes.
1433pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1434self.set.symmetric_difference(&other.set);
1435 }
14361437/// Returns true if and only if this character class will either match
1438 /// nothing or only ASCII bytes. Stated differently, this returns false
1439 /// if and only if this class contains a non-ASCII byte.
1440pub fn is_ascii(&self) -> bool {
1441self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1442 }
14431444/// Returns the length, in bytes, of the smallest string matched by this
1445 /// character class.
1446 ///
1447 /// Returns `None` when the class is empty.
1448pub fn minimum_len(&self) -> Option<usize> {
1449if self.ranges().is_empty() {
1450None1451 } else {
1452Some(1)
1453 }
1454 }
14551456/// Returns the length, in bytes, of the longest string matched by this
1457 /// character class.
1458 ///
1459 /// Returns `None` when the class is empty.
1460pub fn maximum_len(&self) -> Option<usize> {
1461if self.ranges().is_empty() {
1462None1463 } else {
1464Some(1)
1465 }
1466 }
14671468/// If this class consists of exactly one byte, then return it as
1469 /// a literal byte string.
1470 ///
1471 /// If this class is empty or contains more than one byte, then `None`
1472 /// is returned.
1473pub fn literal(&self) -> Option<Vec<u8>> {
1474let rs = self.ranges();
1475if rs.len() == 1 && rs[0].start == rs[0].end {
1476Some(::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
[rs[0].start]))vec![rs[0].start])
1477 } else {
1478None1479 }
1480 }
14811482/// If this class consists of only ASCII ranges, then return its
1483 /// corresponding and equivalent Unicode class.
1484pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1485if !self.is_ascii() {
1486return None;
1487 }
1488Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1489// Since we are guaranteed that our byte range is ASCII, the
1490 // 'char::from' calls below are correct and will not erroneously
1491 // convert a raw byte value into its corresponding codepoint.
1492ClassUnicodeRange {
1493 start: char::from(r.start),
1494 end: char::from(r.end),
1495 }
1496 })))
1497 }
1498}
14991500/// An iterator over all ranges in a byte character class.
1501///
1502/// The lifetime `'a` refers to the lifetime of the underlying class.
1503#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for ClassBytesIter<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "ClassBytesIter",
&&self.0)
}
}Debug)]
1504pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
15051506impl<'a> Iteratorfor ClassBytesIter<'a> {
1507type Item = &'a ClassBytesRange;
15081509fn next(&mut self) -> Option<&'a ClassBytesRange> {
1510self.0.next()
1511 }
1512}
15131514/// A single range of characters represented by arbitrary bytes.
1515///
1516/// The range is closed. That is, the start and end of the range are included
1517/// in the range.
1518#[derive(#[automatically_derived]
impl ::core::clone::Clone for ClassBytesRange {
#[inline]
fn clone(&self) -> ClassBytesRange {
let _: ::core::clone::AssertParamIsClone<u8>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for ClassBytesRange { }Copy, #[automatically_derived]
impl ::core::default::Default for ClassBytesRange {
#[inline]
fn default() -> ClassBytesRange {
ClassBytesRange {
start: ::core::default::Default::default(),
end: ::core::default::Default::default(),
}
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for ClassBytesRange {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u8>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for ClassBytesRange {
#[inline]
fn eq(&self, other: &ClassBytesRange) -> bool {
self.start == other.start && self.end == other.end
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for ClassBytesRange {
#[inline]
fn partial_cmp(&self, other: &ClassBytesRange)
-> ::core::option::Option<::core::cmp::Ordering> {
match ::core::cmp::PartialOrd::partial_cmp(&self.start, &other.start)
{
::core::option::Option::Some(::core::cmp::Ordering::Equal) =>
::core::cmp::PartialOrd::partial_cmp(&self.end, &other.end),
cmp => cmp,
}
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for ClassBytesRange {
#[inline]
fn cmp(&self, other: &ClassBytesRange) -> ::core::cmp::Ordering {
match ::core::cmp::Ord::cmp(&self.start, &other.start) {
::core::cmp::Ordering::Equal =>
::core::cmp::Ord::cmp(&self.end, &other.end),
cmp => cmp,
}
}
}Ord)]
1519pub struct ClassBytesRange {
1520 start: u8,
1521 end: u8,
1522}
15231524impl Intervalfor ClassBytesRange {
1525type Bound = u8;
15261527#[inline]
1528fn lower(&self) -> u8 {
1529self.start
1530 }
1531#[inline]
1532fn upper(&self) -> u8 {
1533self.end
1534 }
1535#[inline]
1536fn set_lower(&mut self, bound: u8) {
1537self.start = bound;
1538 }
1539#[inline]
1540fn set_upper(&mut self, bound: u8) {
1541self.end = bound;
1542 }
15431544/// Apply simple case folding to this byte range. Only ASCII case mappings
1545 /// (for a-z) are applied.
1546 ///
1547 /// Additional ranges are appended to the given vector. Canonical ordering
1548 /// is *not* maintained in the given vector.
1549fn case_fold_simple(
1550&self,
1551 ranges: &mut Vec<ClassBytesRange>,
1552 ) -> Result<(), unicode::CaseFoldError> {
1553if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1554let lower = cmp::max(self.start, b'a');
1555let upper = cmp::min(self.end, b'z');
1556ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1557 }
1558if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1559let lower = cmp::max(self.start, b'A');
1560let upper = cmp::min(self.end, b'Z');
1561ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1562 }
1563Ok(())
1564 }
1565}
15661567impl ClassBytesRange {
1568/// Create a new byte range for a character class.
1569 ///
1570 /// The returned range is always in a canonical form. That is, the range
1571 /// returned always satisfies the invariant that `start <= end`.
1572pub fn new(start: u8, end: u8) -> ClassBytesRange {
1573ClassBytesRange::create(start, end)
1574 }
15751576/// Return the start of this range.
1577 ///
1578 /// The start of a range is always less than or equal to the end of the
1579 /// range.
1580pub fn start(&self) -> u8 {
1581self.start
1582 }
15831584/// Return the end of this range.
1585 ///
1586 /// The end of a range is always greater than or equal to the start of the
1587 /// range.
1588pub fn end(&self) -> u8 {
1589self.end
1590 }
15911592/// Returns the number of bytes in this range.
1593pub fn len(&self) -> usize {
1594usize::from(self.end.checked_sub(self.start).unwrap())
1595 .checked_add(1)
1596 .unwrap()
1597 }
1598}
15991600impl core::fmt::Debugfor ClassBytesRange {
1601fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1602f.debug_struct("ClassBytesRange")
1603 .field("start", &crate::debug::Byte(self.start))
1604 .field("end", &crate::debug::Byte(self.end))
1605 .finish()
1606 }
1607}
16081609/// The high-level intermediate representation for a look-around assertion.
1610///
1611/// An assertion match is always zero-length. Also called an "empty match."
1612#[derive(#[automatically_derived]
impl ::core::clone::Clone for Look {
#[inline]
fn clone(&self) -> Look { *self }
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Look { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for Look {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::write_str(f,
match self {
Look::Start => "Start",
Look::End => "End",
Look::StartLF => "StartLF",
Look::EndLF => "EndLF",
Look::StartCRLF => "StartCRLF",
Look::EndCRLF => "EndCRLF",
Look::WordAscii => "WordAscii",
Look::WordAsciiNegate => "WordAsciiNegate",
Look::WordUnicode => "WordUnicode",
Look::WordUnicodeNegate => "WordUnicodeNegate",
Look::WordStartAscii => "WordStartAscii",
Look::WordEndAscii => "WordEndAscii",
Look::WordStartUnicode => "WordStartUnicode",
Look::WordEndUnicode => "WordEndUnicode",
Look::WordStartHalfAscii => "WordStartHalfAscii",
Look::WordEndHalfAscii => "WordEndHalfAscii",
Look::WordStartHalfUnicode => "WordStartHalfUnicode",
Look::WordEndHalfUnicode => "WordEndHalfUnicode",
})
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Look {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Look {
#[inline]
fn eq(&self, other: &Look) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr
}
}PartialEq)]
1613pub enum Look {
1614/// Match the beginning of text. Specifically, this matches at the starting
1615 /// position of the input.
1616Start = 1 << 0,
1617/// Match the end of text. Specifically, this matches at the ending
1618 /// position of the input.
1619End = 1 << 1,
1620/// Match the beginning of a line or the beginning of text. Specifically,
1621 /// this matches at the starting position of the input, or at the position
1622 /// immediately following a `\n` character.
1623StartLF = 1 << 2,
1624/// Match the end of a line or the end of text. Specifically, this matches
1625 /// at the end position of the input, or at the position immediately
1626 /// preceding a `\n` character.
1627EndLF = 1 << 3,
1628/// Match the beginning of a line or the beginning of text. Specifically,
1629 /// this matches at the starting position of the input, or at the position
1630 /// immediately following either a `\r` or `\n` character, but never after
1631 /// a `\r` when a `\n` follows.
1632StartCRLF = 1 << 4,
1633/// Match the end of a line or the end of text. Specifically, this matches
1634 /// at the end position of the input, or at the position immediately
1635 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1636 /// precedes it.
1637EndCRLF = 1 << 5,
1638/// Match an ASCII-only word boundary. That is, this matches a position
1639 /// where the left adjacent character and right adjacent character
1640 /// correspond to a word and non-word or a non-word and word character.
1641WordAscii = 1 << 6,
1642/// Match an ASCII-only negation of a word boundary.
1643WordAsciiNegate = 1 << 7,
1644/// Match a Unicode-aware word boundary. That is, this matches a position
1645 /// where the left adjacent character and right adjacent character
1646 /// correspond to a word and non-word or a non-word and word character.
1647WordUnicode = 1 << 8,
1648/// Match a Unicode-aware negation of a word boundary.
1649WordUnicodeNegate = 1 << 9,
1650/// Match the start of an ASCII-only word boundary. That is, this matches a
1651 /// position at either the beginning of the haystack or where the previous
1652 /// character is not a word character and the following character is a word
1653 /// character.
1654WordStartAscii = 1 << 10,
1655/// Match the end of an ASCII-only word boundary. That is, this matches
1656 /// a position at either the end of the haystack or where the previous
1657 /// character is a word character and the following character is not a word
1658 /// character.
1659WordEndAscii = 1 << 11,
1660/// Match the start of a Unicode word boundary. That is, this matches a
1661 /// position at either the beginning of the haystack or where the previous
1662 /// character is not a word character and the following character is a word
1663 /// character.
1664WordStartUnicode = 1 << 12,
1665/// Match the end of a Unicode word boundary. That is, this matches a
1666 /// position at either the end of the haystack or where the previous
1667 /// character is a word character and the following character is not a word
1668 /// character.
1669WordEndUnicode = 1 << 13,
1670/// Match the start half of an ASCII-only word boundary. That is, this
1671 /// matches a position at either the beginning of the haystack or where the
1672 /// previous character is not a word character.
1673WordStartHalfAscii = 1 << 14,
1674/// Match the end half of an ASCII-only word boundary. That is, this
1675 /// matches a position at either the end of the haystack or where the
1676 /// following character is not a word character.
1677WordEndHalfAscii = 1 << 15,
1678/// Match the start half of a Unicode word boundary. That is, this matches
1679 /// a position at either the beginning of the haystack or where the
1680 /// previous character is not a word character.
1681WordStartHalfUnicode = 1 << 16,
1682/// Match the end half of a Unicode word boundary. That is, this matches
1683 /// a position at either the end of the haystack or where the following
1684 /// character is not a word character.
1685WordEndHalfUnicode = 1 << 17,
1686}
16871688impl Look {
1689/// Flip the look-around assertion to its equivalent for reverse searches.
1690 /// For example, `StartLF` gets translated to `EndLF`.
1691 ///
1692 /// Some assertions, such as `WordUnicode`, remain the same since they
1693 /// match the same positions regardless of the direction of the search.
1694#[inline]
1695pub const fn reversed(self) -> Look {
1696match self {
1697 Look::Start => Look::End,
1698 Look::End => Look::Start,
1699 Look::StartLF => Look::EndLF,
1700 Look::EndLF => Look::StartLF,
1701 Look::StartCRLF => Look::EndCRLF,
1702 Look::EndCRLF => Look::StartCRLF,
1703 Look::WordAscii => Look::WordAscii,
1704 Look::WordAsciiNegate => Look::WordAsciiNegate,
1705 Look::WordUnicode => Look::WordUnicode,
1706 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1707 Look::WordStartAscii => Look::WordEndAscii,
1708 Look::WordEndAscii => Look::WordStartAscii,
1709 Look::WordStartUnicode => Look::WordEndUnicode,
1710 Look::WordEndUnicode => Look::WordStartUnicode,
1711 Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1712 Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1713 Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1714 Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1715 }
1716 }
17171718/// Return the underlying representation of this look-around enumeration
1719 /// as an integer. Giving the return value to the [`Look::from_repr`]
1720 /// constructor is guaranteed to return the same look-around variant that
1721 /// one started with within a semver compatible release of this crate.
1722#[inline]
1723pub const fn as_repr(self) -> u32 {
1724// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1725 // actual int.
1726selfas u321727 }
17281729/// Given the underlying representation of a `Look` value, return the
1730 /// corresponding `Look` value if the representation is valid. Otherwise
1731 /// `None` is returned.
1732#[inline]
1733pub const fn from_repr(repr: u32) -> Option<Look> {
1734match repr {
17350b00_0000_0000_0000_0001 => Some(Look::Start),
17360b00_0000_0000_0000_0010 => Some(Look::End),
17370b00_0000_0000_0000_0100 => Some(Look::StartLF),
17380b00_0000_0000_0000_1000 => Some(Look::EndLF),
17390b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
17400b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
17410b00_0000_0000_0100_0000 => Some(Look::WordAscii),
17420b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
17430b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
17440b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
17450b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
17460b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
17470b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
17480b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
17490b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
17500b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
17510b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
17520b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1753_ => None,
1754 }
1755 }
17561757/// Returns a convenient single codepoint representation of this
1758 /// look-around assertion. Each assertion is guaranteed to be represented
1759 /// by a distinct character.
1760 ///
1761 /// This is useful for succinctly representing a look-around assertion in
1762 /// human friendly but succinct output intended for a programmer working on
1763 /// regex internals.
1764#[inline]
1765pub const fn as_char(self) -> char {
1766match self {
1767 Look::Start => 'A',
1768 Look::End => 'z',
1769 Look::StartLF => '^',
1770 Look::EndLF => '$',
1771 Look::StartCRLF => 'r',
1772 Look::EndCRLF => 'R',
1773 Look::WordAscii => 'b',
1774 Look::WordAsciiNegate => 'B',
1775 Look::WordUnicode => '𝛃',
1776 Look::WordUnicodeNegate => '𝚩',
1777 Look::WordStartAscii => '<',
1778 Look::WordEndAscii => '>',
1779 Look::WordStartUnicode => '〈',
1780 Look::WordEndUnicode => '〉',
1781 Look::WordStartHalfAscii => '◁',
1782 Look::WordEndHalfAscii => '▷',
1783 Look::WordStartHalfUnicode => '◀',
1784 Look::WordEndHalfUnicode => '▶',
1785 }
1786 }
1787}
17881789/// The high-level intermediate representation for a capturing group.
1790///
1791/// A capturing group always has an index and a child expression. It may
1792/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1793/// necessary.
1794///
1795/// Note that there is no explicit representation of a non-capturing group
1796/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1797/// the recursive structure of the `Hir` itself.
1798#[derive(#[automatically_derived]
impl ::core::clone::Clone for Capture {
#[inline]
fn clone(&self) -> Capture {
Capture {
index: ::core::clone::Clone::clone(&self.index),
name: ::core::clone::Clone::clone(&self.name),
sub: ::core::clone::Clone::clone(&self.sub),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Capture {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field3_finish(f, "Capture",
"index", &self.index, "name", &self.name, "sub", &&self.sub)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Capture {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u32>;
let _: ::core::cmp::AssertParamIsEq<Option<Box<str>>>;
let _: ::core::cmp::AssertParamIsEq<Box<Hir>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Capture {
#[inline]
fn eq(&self, other: &Capture) -> bool {
self.index == other.index && self.name == other.name &&
self.sub == other.sub
}
}PartialEq)]
1799pub struct Capture {
1800/// The capture index of the capture.
1801pub index: u32,
1802/// The name of the capture, if it exists.
1803pub name: Option<Box<str>>,
1804/// The expression inside the capturing group, which may be empty.
1805pub sub: Box<Hir>,
1806}
18071808/// The high-level intermediate representation of a repetition operator.
1809///
1810/// A repetition operator permits the repetition of an arbitrary
1811/// sub-expression.
1812#[derive(#[automatically_derived]
impl ::core::clone::Clone for Repetition {
#[inline]
fn clone(&self) -> Repetition {
Repetition {
min: ::core::clone::Clone::clone(&self.min),
max: ::core::clone::Clone::clone(&self.max),
greedy: ::core::clone::Clone::clone(&self.greedy),
sub: ::core::clone::Clone::clone(&self.sub),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Repetition {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field4_finish(f, "Repetition",
"min", &self.min, "max", &self.max, "greedy", &self.greedy, "sub",
&&self.sub)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Repetition {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u32>;
let _: ::core::cmp::AssertParamIsEq<Option<u32>>;
let _: ::core::cmp::AssertParamIsEq<bool>;
let _: ::core::cmp::AssertParamIsEq<Box<Hir>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Repetition {
#[inline]
fn eq(&self, other: &Repetition) -> bool {
self.min == other.min && self.greedy == other.greedy &&
self.max == other.max && self.sub == other.sub
}
}PartialEq)]
1813pub struct Repetition {
1814/// The minimum range of the repetition.
1815 ///
1816 /// Note that special cases like `?`, `+` and `*` all get translated into
1817 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1818 ///
1819 /// When `min` is zero, this expression can match the empty string
1820 /// regardless of what its sub-expression is.
1821pub min: u32,
1822/// The maximum range of the repetition.
1823 ///
1824 /// Note that when `max` is `None`, `min` acts as a lower bound but where
1825 /// there is no upper bound. For something like `x{5}` where the min and
1826 /// max are equivalent, `min` will be set to `5` and `max` will be set to
1827 /// `Some(5)`.
1828pub max: Option<u32>,
1829/// Whether this repetition operator is greedy or not. A greedy operator
1830 /// will match as much as it can. A non-greedy operator will match as
1831 /// little as it can.
1832 ///
1833 /// Typically, operators are greedy by default and are only non-greedy when
1834 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1835 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1836pub greedy: bool,
1837/// The expression being repeated.
1838pub sub: Box<Hir>,
1839}
18401841impl Repetition {
1842/// Returns a new repetition with the same `min`, `max` and `greedy`
1843 /// values, but with its sub-expression replaced with the one given.
1844pub fn with(&self, sub: Hir) -> Repetition {
1845Repetition {
1846 min: self.min,
1847 max: self.max,
1848 greedy: self.greedy,
1849 sub: Box::new(sub),
1850 }
1851 }
1852}
18531854/// A type describing the different flavors of `.`.
1855///
1856/// This type is meant to be used with [`Hir::dot`], which is a convenience
1857/// routine for building HIR values derived from the `.` regex.
1858#[non_exhaustive]
1859#[derive(#[automatically_derived]
impl ::core::clone::Clone for Dot {
#[inline]
fn clone(&self) -> Dot {
let _: ::core::clone::AssertParamIsClone<char>;
let _: ::core::clone::AssertParamIsClone<u8>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for Dot { }Copy, #[automatically_derived]
impl ::core::fmt::Debug for Dot {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
match self {
Dot::AnyChar => ::core::fmt::Formatter::write_str(f, "AnyChar"),
Dot::AnyByte => ::core::fmt::Formatter::write_str(f, "AnyByte"),
Dot::AnyCharExcept(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"AnyCharExcept", &__self_0),
Dot::AnyCharExceptLF =>
::core::fmt::Formatter::write_str(f, "AnyCharExceptLF"),
Dot::AnyCharExceptCRLF =>
::core::fmt::Formatter::write_str(f, "AnyCharExceptCRLF"),
Dot::AnyByteExcept(__self_0) =>
::core::fmt::Formatter::debug_tuple_field1_finish(f,
"AnyByteExcept", &__self_0),
Dot::AnyByteExceptLF =>
::core::fmt::Formatter::write_str(f, "AnyByteExceptLF"),
Dot::AnyByteExceptCRLF =>
::core::fmt::Formatter::write_str(f, "AnyByteExceptCRLF"),
}
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Dot {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<char>;
let _: ::core::cmp::AssertParamIsEq<u8>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Dot {
#[inline]
fn eq(&self, other: &Dot) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr &&
match (self, other) {
(Dot::AnyCharExcept(__self_0), Dot::AnyCharExcept(__arg1_0))
=> __self_0 == __arg1_0,
(Dot::AnyByteExcept(__self_0), Dot::AnyByteExcept(__arg1_0))
=> __self_0 == __arg1_0,
_ => true,
}
}
}PartialEq)]
1860pub enum Dot {
1861/// Matches the UTF-8 encoding of any Unicode scalar value.
1862 ///
1863 /// This is equivalent to `(?su:.)` and also `\p{any}`.
1864AnyChar,
1865/// Matches any byte value.
1866 ///
1867 /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1868AnyByte,
1869/// Matches the UTF-8 encoding of any Unicode scalar value except for the
1870 /// `char` given.
1871 ///
1872 /// This is equivalent to using `(?u-s:.)` with the line terminator set
1873 /// to a particular ASCII byte. (Because of peculiarities in the regex
1874 /// engines, a line terminator must be a single byte. It follows that when
1875 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1876 /// value. That is, ti must be ASCII.)
1877 ///
1878 /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1879 /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1880AnyCharExcept(char),
1881/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1882 ///
1883 /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1884AnyCharExceptLF,
1885/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1886 /// and `\n`.
1887 ///
1888 /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1889AnyCharExceptCRLF,
1890/// Matches any byte value except for the `u8` given.
1891 ///
1892 /// This is equivalent to using `(?-us:.)` with the line terminator set
1893 /// to a particular ASCII byte. (Because of peculiarities in the regex
1894 /// engines, a line terminator must be a single byte. It follows that when
1895 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1896 /// value. That is, ti must be ASCII.)
1897 ///
1898 /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1899 /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1900AnyByteExcept(u8),
1901/// Matches any byte value except for `\n`.
1902 ///
1903 /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1904AnyByteExceptLF,
1905/// Matches any byte value except for `\r` and `\n`.
1906 ///
1907 /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1908AnyByteExceptCRLF,
1909}
19101911/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1912/// space but heap space proportional to the depth of the total `Hir`.
1913impl Dropfor Hir {
1914fn drop(&mut self) {
1915use core::mem;
19161917match *self.kind() {
1918 HirKind::Empty1919 | HirKind::Literal(_)
1920 | HirKind::Class(_)
1921 | HirKind::Look(_) => return,
1922 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1923 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1924return
1925}
1926 HirKind::Concat(ref x) if x.is_empty() => return,
1927 HirKind::Alternation(ref x) if x.is_empty() => return,
1928_ => {}
1929 }
19301931let mut stack = ::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
[mem::replace(self, Hir::empty())]))vec![mem::replace(self, Hir::empty())];
1932while let Some(mut expr) = stack.pop() {
1933match expr.kind {
1934 HirKind::Empty
1935 | HirKind::Literal(_)
1936 | HirKind::Class(_)
1937 | HirKind::Look(_) => {}
1938 HirKind::Capture(ref mut x) => {
1939 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1940 }
1941 HirKind::Repetition(ref mut x) => {
1942 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1943 }
1944 HirKind::Concat(ref mut x) => {
1945 stack.extend(x.drain(..));
1946 }
1947 HirKind::Alternation(ref mut x) => {
1948 stack.extend(x.drain(..));
1949 }
1950 }
1951 }
1952 }
1953}
19541955/// A type that collects various properties of an HIR value.
1956///
1957/// Properties are always scalar values and represent meta data that is
1958/// computed inductively on an HIR value. Properties are defined for all
1959/// HIR values.
1960///
1961/// All methods on a `Properties` value take constant time and are meant to
1962/// be cheap to call.
1963#[derive(#[automatically_derived]
impl ::core::clone::Clone for Properties {
#[inline]
fn clone(&self) -> Properties {
Properties(::core::clone::Clone::clone(&self.0))
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for Properties {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_tuple_field1_finish(f, "Properties",
&&self.0)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for Properties {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Box<PropertiesI>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Properties {
#[inline]
fn eq(&self, other: &Properties) -> bool { self.0 == other.0 }
}PartialEq)]
1964pub struct Properties(Box<PropertiesI>);
19651966/// The property definition. It is split out so that we can box it, and
1967/// there by make `Properties` use less stack size. This is kind-of important
1968/// because every HIR value has a `Properties` attached to it.
1969///
1970/// This does have the unfortunate consequence that creating any HIR value
1971/// always leads to at least one alloc for properties, but this is generally
1972/// true anyway (for pretty much all HirKinds except for look-arounds).
1973#[derive(#[automatically_derived]
impl ::core::clone::Clone for PropertiesI {
#[inline]
fn clone(&self) -> PropertiesI {
PropertiesI {
minimum_len: ::core::clone::Clone::clone(&self.minimum_len),
maximum_len: ::core::clone::Clone::clone(&self.maximum_len),
look_set: ::core::clone::Clone::clone(&self.look_set),
look_set_prefix: ::core::clone::Clone::clone(&self.look_set_prefix),
look_set_suffix: ::core::clone::Clone::clone(&self.look_set_suffix),
look_set_prefix_any: ::core::clone::Clone::clone(&self.look_set_prefix_any),
look_set_suffix_any: ::core::clone::Clone::clone(&self.look_set_suffix_any),
utf8: ::core::clone::Clone::clone(&self.utf8),
explicit_captures_len: ::core::clone::Clone::clone(&self.explicit_captures_len),
static_explicit_captures_len: ::core::clone::Clone::clone(&self.static_explicit_captures_len),
literal: ::core::clone::Clone::clone(&self.literal),
alternation_literal: ::core::clone::Clone::clone(&self.alternation_literal),
}
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for PropertiesI {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
let names: &'static _ =
&["minimum_len", "maximum_len", "look_set", "look_set_prefix",
"look_set_suffix", "look_set_prefix_any",
"look_set_suffix_any", "utf8", "explicit_captures_len",
"static_explicit_captures_len", "literal",
"alternation_literal"];
let values: &[&dyn ::core::fmt::Debug] =
&[&self.minimum_len, &self.maximum_len, &self.look_set,
&self.look_set_prefix, &self.look_set_suffix,
&self.look_set_prefix_any, &self.look_set_suffix_any,
&self.utf8, &self.explicit_captures_len,
&self.static_explicit_captures_len, &self.literal,
&&self.alternation_literal];
::core::fmt::Formatter::debug_struct_fields_finish(f, "PropertiesI",
names, values)
}
}Debug, #[automatically_derived]
impl ::core::cmp::Eq for PropertiesI {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Option<usize>>;
let _: ::core::cmp::AssertParamIsEq<Option<usize>>;
let _: ::core::cmp::AssertParamIsEq<LookSet>;
let _: ::core::cmp::AssertParamIsEq<bool>;
let _: ::core::cmp::AssertParamIsEq<usize>;
let _: ::core::cmp::AssertParamIsEq<Option<usize>>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for PropertiesI {
#[inline]
fn eq(&self, other: &PropertiesI) -> bool {
self.utf8 == other.utf8 && self.literal == other.literal &&
self.alternation_literal == other.alternation_literal &&
self.minimum_len == other.minimum_len &&
self.maximum_len == other.maximum_len &&
self.look_set == other.look_set &&
self.look_set_prefix == other.look_set_prefix &&
self.look_set_suffix == other.look_set_suffix &&
self.look_set_prefix_any == other.look_set_prefix_any &&
self.look_set_suffix_any == other.look_set_suffix_any &&
self.explicit_captures_len == other.explicit_captures_len &&
self.static_explicit_captures_len ==
other.static_explicit_captures_len
}
}PartialEq)]
1974struct PropertiesI {
1975 minimum_len: Option<usize>,
1976 maximum_len: Option<usize>,
1977 look_set: LookSet,
1978 look_set_prefix: LookSet,
1979 look_set_suffix: LookSet,
1980 look_set_prefix_any: LookSet,
1981 look_set_suffix_any: LookSet,
1982 utf8: bool,
1983 explicit_captures_len: usize,
1984 static_explicit_captures_len: Option<usize>,
1985 literal: bool,
1986 alternation_literal: bool,
1987}
19881989impl Properties {
1990/// Returns the length (in bytes) of the smallest string matched by this
1991 /// HIR.
1992 ///
1993 /// A return value of `0` is possible and occurs when the HIR can match an
1994 /// empty string.
1995 ///
1996 /// `None` is returned when there is no minimum length. This occurs in
1997 /// precisely the cases where the HIR matches nothing. i.e., The language
1998 /// the regex matches is empty. An example of such a regex is `\P{any}`.
1999#[inline]
2000pub fn minimum_len(&self) -> Option<usize> {
2001self.0.minimum_len
2002 }
20032004/// Returns the length (in bytes) of the longest string matched by this
2005 /// HIR.
2006 ///
2007 /// A return value of `0` is possible and occurs when nothing longer than
2008 /// the empty string is in the language described by this HIR.
2009 ///
2010 /// `None` is returned when there is no longest matching string. This
2011 /// occurs when the HIR matches nothing or when there is no upper bound on
2012 /// the length of matching strings. Example of such regexes are `\P{any}`
2013 /// (matches nothing) and `a+` (has no upper bound).
2014#[inline]
2015pub fn maximum_len(&self) -> Option<usize> {
2016self.0.maximum_len
2017 }
20182019/// Returns a set of all look-around assertions that appear at least once
2020 /// in this HIR value.
2021#[inline]
2022pub fn look_set(&self) -> LookSet {
2023self.0.look_set
2024 }
20252026/// Returns a set of all look-around assertions that appear as a prefix for
2027 /// this HIR value. That is, the set returned corresponds to the set of
2028 /// assertions that must be passed before matching any bytes in a haystack.
2029 ///
2030 /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2031 /// if and only if the HIR is fully anchored at the start.
2032#[inline]
2033pub fn look_set_prefix(&self) -> LookSet {
2034self.0.look_set_prefix
2035 }
20362037/// Returns a set of all look-around assertions that appear as a _possible_
2038 /// prefix for this HIR value. That is, the set returned corresponds to the
2039 /// set of assertions that _may_ be passed before matching any bytes in a
2040 /// haystack.
2041 ///
2042 /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2043 /// true if and only if it's possible for the regex to match through a
2044 /// anchored assertion before consuming any input.
2045#[inline]
2046pub fn look_set_prefix_any(&self) -> LookSet {
2047self.0.look_set_prefix_any
2048 }
20492050/// Returns a set of all look-around assertions that appear as a suffix for
2051 /// this HIR value. That is, the set returned corresponds to the set of
2052 /// assertions that must be passed in order to be considered a match after
2053 /// all other consuming HIR expressions.
2054 ///
2055 /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2056 /// if and only if the HIR is fully anchored at the end.
2057#[inline]
2058pub fn look_set_suffix(&self) -> LookSet {
2059self.0.look_set_suffix
2060 }
20612062/// Returns a set of all look-around assertions that appear as a _possible_
2063 /// suffix for this HIR value. That is, the set returned corresponds to the
2064 /// set of assertions that _may_ be passed before matching any bytes in a
2065 /// haystack.
2066 ///
2067 /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2068 /// true if and only if it's possible for the regex to match through a
2069 /// anchored assertion at the end of a match without consuming any input.
2070#[inline]
2071pub fn look_set_suffix_any(&self) -> LookSet {
2072self.0.look_set_suffix_any
2073 }
20742075/// Return true if and only if the corresponding HIR will always match
2076 /// valid UTF-8.
2077 ///
2078 /// When this returns false, then it is possible for this HIR expression to
2079 /// match invalid UTF-8, including by matching between the code units of
2080 /// a single UTF-8 encoded codepoint.
2081 ///
2082 /// Note that this returns true even when the corresponding HIR can match
2083 /// the empty string. Since an empty string can technically appear between
2084 /// UTF-8 code units, it is possible for a match to be reported that splits
2085 /// a codepoint which could in turn be considered matching invalid UTF-8.
2086 /// However, it is generally assumed that such empty matches are handled
2087 /// specially by the search routine if it is absolutely required that
2088 /// matches not split a codepoint.
2089 ///
2090 /// # Example
2091 ///
2092 /// This code example shows the UTF-8 property of a variety of patterns.
2093 ///
2094 /// ```
2095 /// use regex_syntax::{ParserBuilder, parse};
2096 ///
2097 /// // Examples of 'is_utf8() == true'.
2098 /// assert!(parse(r"a")?.properties().is_utf8());
2099 /// assert!(parse(r"[^a]")?.properties().is_utf8());
2100 /// assert!(parse(r".")?.properties().is_utf8());
2101 /// assert!(parse(r"\W")?.properties().is_utf8());
2102 /// assert!(parse(r"\b")?.properties().is_utf8());
2103 /// assert!(parse(r"\B")?.properties().is_utf8());
2104 /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2105 /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2106 /// // Unicode mode is enabled by default, and in
2107 /// // that mode, all \x hex escapes are treated as
2108 /// // codepoints. So this actually matches the UTF-8
2109 /// // encoding of U+00FF.
2110 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2111 ///
2112 /// // Now we show examples of 'is_utf8() == false'.
2113 /// // The only way to do this is to force the parser
2114 /// // to permit invalid UTF-8, otherwise all of these
2115 /// // would fail to parse!
2116 /// let parse = |pattern| {
2117 /// ParserBuilder::new().utf8(false).build().parse(pattern)
2118 /// };
2119 /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2120 /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2121 /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2122 /// // Conversely to the equivalent example above,
2123 /// // when Unicode mode is disabled, \x hex escapes
2124 /// // are treated as their raw byte values.
2125 /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2126 /// // Note that just because we disabled UTF-8 in the
2127 /// // parser doesn't mean we still can't use Unicode.
2128 /// // It is enabled by default, so \xFF is still
2129 /// // equivalent to matching the UTF-8 encoding of
2130 /// // U+00FF by default.
2131 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2132 /// // Even though we use raw bytes that individually
2133 /// // are not valid UTF-8, when combined together, the
2134 /// // overall expression *does* match valid UTF-8!
2135 /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2136 ///
2137 /// # Ok::<(), Box<dyn std::error::Error>>(())
2138 /// ```
2139#[inline]
2140pub fn is_utf8(&self) -> bool {
2141self.0.utf8
2142 }
21432144/// Returns the total number of explicit capturing groups in the
2145 /// corresponding HIR.
2146 ///
2147 /// Note that this does not include the implicit capturing group
2148 /// corresponding to the entire match that is typically included by regex
2149 /// engines.
2150 ///
2151 /// # Example
2152 ///
2153 /// This method will return `0` for `a` and `1` for `(a)`:
2154 ///
2155 /// ```
2156 /// use regex_syntax::parse;
2157 ///
2158 /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2159 /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2160 ///
2161 /// # Ok::<(), Box<dyn std::error::Error>>(())
2162 /// ```
2163#[inline]
2164pub fn explicit_captures_len(&self) -> usize {
2165self.0.explicit_captures_len
2166 }
21672168/// Returns the total number of explicit capturing groups that appear in
2169 /// every possible match.
2170 ///
2171 /// If the number of capture groups can vary depending on the match, then
2172 /// this returns `None`. That is, a value is only returned when the number
2173 /// of matching groups is invariant or "static."
2174 ///
2175 /// Note that this does not include the implicit capturing group
2176 /// corresponding to the entire match.
2177 ///
2178 /// # Example
2179 ///
2180 /// This shows a few cases where a static number of capture groups is
2181 /// available and a few cases where it is not.
2182 ///
2183 /// ```
2184 /// use regex_syntax::parse;
2185 ///
2186 /// let len = |pattern| {
2187 /// parse(pattern).map(|h| {
2188 /// h.properties().static_explicit_captures_len()
2189 /// })
2190 /// };
2191 ///
2192 /// assert_eq!(Some(0), len("a")?);
2193 /// assert_eq!(Some(1), len("(a)")?);
2194 /// assert_eq!(Some(1), len("(a)|(b)")?);
2195 /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2196 /// assert_eq!(None, len("(a)|b")?);
2197 /// assert_eq!(None, len("a|(b)")?);
2198 /// assert_eq!(None, len("(b)*")?);
2199 /// assert_eq!(Some(1), len("(b)+")?);
2200 ///
2201 /// # Ok::<(), Box<dyn std::error::Error>>(())
2202 /// ```
2203#[inline]
2204pub fn static_explicit_captures_len(&self) -> Option<usize> {
2205self.0.static_explicit_captures_len
2206 }
22072208/// Return true if and only if this HIR is a simple literal. This is
2209 /// only true when this HIR expression is either itself a `Literal` or a
2210 /// concatenation of only `Literal`s.
2211 ///
2212 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2213 /// the empty string are not (even though they contain sub-expressions that
2214 /// are literals).
2215#[inline]
2216pub fn is_literal(&self) -> bool {
2217self.0.literal
2218 }
22192220/// Return true if and only if this HIR is either a simple literal or an
2221 /// alternation of simple literals. This is only
2222 /// true when this HIR expression is either itself a `Literal` or a
2223 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2224 ///
2225 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2226 /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2227 /// (even though that contain sub-expressions that are literals).
2228#[inline]
2229pub fn is_alternation_literal(&self) -> bool {
2230self.0.alternation_literal
2231 }
22322233/// Returns the total amount of heap memory usage, in bytes, used by this
2234 /// `Properties` value.
2235#[inline]
2236pub fn memory_usage(&self) -> usize {
2237 core::mem::size_of::<PropertiesI>()
2238 }
22392240/// Returns a new set of properties that corresponds to the union of the
2241 /// iterator of properties given.
2242 ///
2243 /// This is useful when one has multiple `Hir` expressions and wants
2244 /// to combine them into a single alternation without constructing the
2245 /// corresponding `Hir`. This routine provides a way of combining the
2246 /// properties of each `Hir` expression into one set of properties
2247 /// representing the union of those expressions.
2248 ///
2249 /// # Example: union with HIRs that never match
2250 ///
2251 /// This example shows that unioning properties together with one that
2252 /// represents a regex that never matches will "poison" certain attributes,
2253 /// like the minimum and maximum lengths.
2254 ///
2255 /// ```
2256 /// use regex_syntax::{hir::Properties, parse};
2257 ///
2258 /// let hir1 = parse("ab?c?")?;
2259 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2260 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2261 ///
2262 /// let hir2 = parse(r"[a&&b]")?;
2263 /// assert_eq!(None, hir2.properties().minimum_len());
2264 /// assert_eq!(None, hir2.properties().maximum_len());
2265 ///
2266 /// let hir3 = parse(r"wxy?z?")?;
2267 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2268 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2269 ///
2270 /// let unioned = Properties::union([
2271 /// hir1.properties(),
2272 /// hir2.properties(),
2273 /// hir3.properties(),
2274 /// ]);
2275 /// assert_eq!(None, unioned.minimum_len());
2276 /// assert_eq!(None, unioned.maximum_len());
2277 ///
2278 /// # Ok::<(), Box<dyn std::error::Error>>(())
2279 /// ```
2280 ///
2281 /// The maximum length can also be "poisoned" by a pattern that has no
2282 /// upper bound on the length of a match. The minimum length remains
2283 /// unaffected:
2284 ///
2285 /// ```
2286 /// use regex_syntax::{hir::Properties, parse};
2287 ///
2288 /// let hir1 = parse("ab?c?")?;
2289 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2290 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2291 ///
2292 /// let hir2 = parse(r"a+")?;
2293 /// assert_eq!(Some(1), hir2.properties().minimum_len());
2294 /// assert_eq!(None, hir2.properties().maximum_len());
2295 ///
2296 /// let hir3 = parse(r"wxy?z?")?;
2297 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2298 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2299 ///
2300 /// let unioned = Properties::union([
2301 /// hir1.properties(),
2302 /// hir2.properties(),
2303 /// hir3.properties(),
2304 /// ]);
2305 /// assert_eq!(Some(1), unioned.minimum_len());
2306 /// assert_eq!(None, unioned.maximum_len());
2307 ///
2308 /// # Ok::<(), Box<dyn std::error::Error>>(())
2309 /// ```
2310pub fn union<I, P>(props: I) -> Properties2311where
2312I: IntoIterator<Item = P>,
2313 P: core::borrow::Borrow<Properties>,
2314 {
2315let mut it = props.into_iter().peekable();
2316// While empty alternations aren't possible, we still behave as if they
2317 // are. When we have an empty alternate, then clearly the look-around
2318 // prefix and suffix is empty. Otherwise, it is the intersection of all
2319 // prefixes and suffixes (respectively) of the branches.
2320let fix = if it.peek().is_none() {
2321LookSet::empty()
2322 } else {
2323LookSet::full()
2324 };
2325// And also, an empty alternate means we have 0 static capture groups,
2326 // but we otherwise start with the number corresponding to the first
2327 // alternate. If any subsequent alternate has a different number of
2328 // static capture groups, then we overall have a variation and not a
2329 // static number of groups.
2330let static_explicit_captures_len =
2331it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2332// The base case is an empty alternation, which matches nothing.
2333 // Note though that empty alternations aren't possible, because the
2334 // Hir::alternation smart constructor rewrites those as empty character
2335 // classes.
2336let mut props = PropertiesI {
2337 minimum_len: None,
2338 maximum_len: None,
2339 look_set: LookSet::empty(),
2340 look_set_prefix: fix,
2341 look_set_suffix: fix,
2342 look_set_prefix_any: LookSet::empty(),
2343 look_set_suffix_any: LookSet::empty(),
2344 utf8: true,
2345 explicit_captures_len: 0,
2346static_explicit_captures_len,
2347 literal: false,
2348 alternation_literal: true,
2349 };
2350let (mut min_poisoned, mut max_poisoned) = (false, false);
2351// Handle properties that need to visit every child hir.
2352for prop in it {
2353let p = prop.borrow();
2354 props.look_set.set_union(p.look_set());
2355 props.look_set_prefix.set_intersect(p.look_set_prefix());
2356 props.look_set_suffix.set_intersect(p.look_set_suffix());
2357 props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2358 props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2359 props.utf8 = props.utf8 && p.is_utf8();
2360 props.explicit_captures_len = props
2361 .explicit_captures_len
2362 .saturating_add(p.explicit_captures_len());
2363if props.static_explicit_captures_len
2364 != p.static_explicit_captures_len()
2365 {
2366 props.static_explicit_captures_len = None;
2367 }
2368 props.alternation_literal =
2369 props.alternation_literal && p.is_literal();
2370if !min_poisoned {
2371if let Some(xmin) = p.minimum_len() {
2372if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2373 props.minimum_len = Some(xmin);
2374 }
2375 } else {
2376 props.minimum_len = None;
2377 min_poisoned = true;
2378 }
2379 }
2380if !max_poisoned {
2381if let Some(xmax) = p.maximum_len() {
2382if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2383 props.maximum_len = Some(xmax);
2384 }
2385 } else {
2386 props.maximum_len = None;
2387 max_poisoned = true;
2388 }
2389 }
2390 }
2391Properties(Box::new(props))
2392 }
2393}
23942395impl Properties {
2396/// Create a new set of HIR properties for an empty regex.
2397fn empty() -> Properties {
2398let inner = PropertiesI {
2399 minimum_len: Some(0),
2400 maximum_len: Some(0),
2401 look_set: LookSet::empty(),
2402 look_set_prefix: LookSet::empty(),
2403 look_set_suffix: LookSet::empty(),
2404 look_set_prefix_any: LookSet::empty(),
2405 look_set_suffix_any: LookSet::empty(),
2406// It is debatable whether an empty regex always matches at valid
2407 // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2408 // it is clearly false. There are, for example, many empty strings
2409 // between the bytes encoding a '☃'.
2410 //
2411 // However, when Unicode mode is enabled, the fundamental atom
2412 // of matching is really a codepoint. And in that scenario, an
2413 // empty regex is defined to only match at valid UTF-8 boundaries
2414 // and to never split a codepoint. It just so happens that this
2415 // enforcement is somewhat tricky to do for regexes that match
2416 // the empty string inside regex engines themselves. It usually
2417 // requires some layer above the regex engine to filter out such
2418 // matches.
2419 //
2420 // In any case, 'true' is really the only coherent option. If it
2421 // were false, for example, then 'a*' would also need to be false
2422 // since it too can match the empty string.
2423utf8: true,
2424 explicit_captures_len: 0,
2425 static_explicit_captures_len: Some(0),
2426 literal: false,
2427 alternation_literal: false,
2428 };
2429Properties(Box::new(inner))
2430 }
24312432/// Create a new set of HIR properties for a literal regex.
2433fn literal(lit: &Literal) -> Properties {
2434let inner = PropertiesI {
2435 minimum_len: Some(lit.0.len()),
2436 maximum_len: Some(lit.0.len()),
2437 look_set: LookSet::empty(),
2438 look_set_prefix: LookSet::empty(),
2439 look_set_suffix: LookSet::empty(),
2440 look_set_prefix_any: LookSet::empty(),
2441 look_set_suffix_any: LookSet::empty(),
2442 utf8: core::str::from_utf8(&lit.0).is_ok(),
2443 explicit_captures_len: 0,
2444 static_explicit_captures_len: Some(0),
2445 literal: true,
2446 alternation_literal: true,
2447 };
2448Properties(Box::new(inner))
2449 }
24502451/// Create a new set of HIR properties for a character class.
2452fn class(class: &Class) -> Properties {
2453let inner = PropertiesI {
2454 minimum_len: class.minimum_len(),
2455 maximum_len: class.maximum_len(),
2456 look_set: LookSet::empty(),
2457 look_set_prefix: LookSet::empty(),
2458 look_set_suffix: LookSet::empty(),
2459 look_set_prefix_any: LookSet::empty(),
2460 look_set_suffix_any: LookSet::empty(),
2461 utf8: class.is_utf8(),
2462 explicit_captures_len: 0,
2463 static_explicit_captures_len: Some(0),
2464 literal: false,
2465 alternation_literal: false,
2466 };
2467Properties(Box::new(inner))
2468 }
24692470/// Create a new set of HIR properties for a look-around assertion.
2471fn look(look: Look) -> Properties {
2472let inner = PropertiesI {
2473 minimum_len: Some(0),
2474 maximum_len: Some(0),
2475 look_set: LookSet::singleton(look),
2476 look_set_prefix: LookSet::singleton(look),
2477 look_set_suffix: LookSet::singleton(look),
2478 look_set_prefix_any: LookSet::singleton(look),
2479 look_set_suffix_any: LookSet::singleton(look),
2480// This requires a little explanation. Basically, we don't consider
2481 // matching an empty string to be equivalent to matching invalid
2482 // UTF-8, even though technically matching every empty string will
2483 // split the UTF-8 encoding of a single codepoint when treating a
2484 // UTF-8 encoded string as a sequence of bytes. Our defense here is
2485 // that in such a case, a codepoint should logically be treated as
2486 // the fundamental atom for matching, and thus the only valid match
2487 // points are between codepoints and not bytes.
2488 //
2489 // More practically, this is true here because it's also true
2490 // for 'Hir::empty()', otherwise something like 'a*' would be
2491 // considered to match invalid UTF-8. That in turn makes this
2492 // property borderline useless.
2493utf8: true,
2494 explicit_captures_len: 0,
2495 static_explicit_captures_len: Some(0),
2496 literal: false,
2497 alternation_literal: false,
2498 };
2499Properties(Box::new(inner))
2500 }
25012502/// Create a new set of HIR properties for a repetition.
2503fn repetition(rep: &Repetition) -> Properties {
2504let p = rep.sub.properties();
2505let minimum_len = p.minimum_len().map(|child_min| {
2506let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2507child_min.saturating_mul(rep_min)
2508 });
2509let maximum_len = rep.max.and_then(|rep_max| {
2510let rep_max = usize::try_from(rep_max).ok()?;
2511let child_max = p.maximum_len()?;
2512child_max.checked_mul(rep_max)
2513 });
25142515let mut inner = PropertiesI {
2516minimum_len,
2517maximum_len,
2518 look_set: p.look_set(),
2519 look_set_prefix: LookSet::empty(),
2520 look_set_suffix: LookSet::empty(),
2521 look_set_prefix_any: p.look_set_prefix_any(),
2522 look_set_suffix_any: p.look_set_suffix_any(),
2523 utf8: p.is_utf8(),
2524 explicit_captures_len: p.explicit_captures_len(),
2525 static_explicit_captures_len: p.static_explicit_captures_len(),
2526 literal: false,
2527 alternation_literal: false,
2528 };
2529// If the repetition operator can match the empty string, then its
2530 // lookset prefix and suffixes themselves remain empty since they are
2531 // no longer required to match.
2532if rep.min > 0 {
2533inner.look_set_prefix = p.look_set_prefix();
2534inner.look_set_suffix = p.look_set_suffix();
2535 }
2536// If the static captures len of the sub-expression is not known or
2537 // is greater than zero, then it automatically propagates to the
2538 // repetition, regardless of the repetition. Otherwise, it might
2539 // change, but only when the repetition can match 0 times.
2540if rep.min == 0
2541&& inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2542 {
2543// If we require a match 0 times, then our captures len is
2544 // guaranteed to be zero. Otherwise, if we *can* match the empty
2545 // string, then it's impossible to know how many captures will be
2546 // in the resulting match.
2547if rep.max == Some(0) {
2548inner.static_explicit_captures_len = Some(0);
2549 } else {
2550inner.static_explicit_captures_len = None;
2551 }
2552 }
2553Properties(Box::new(inner))
2554 }
25552556/// Create a new set of HIR properties for a capture.
2557fn capture(capture: &Capture) -> Properties {
2558let p = capture.sub.properties();
2559Properties(Box::new(PropertiesI {
2560 explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2561 static_explicit_captures_len: p2562 .static_explicit_captures_len()
2563 .map(|len| len.saturating_add(1)),
2564 literal: false,
2565 alternation_literal: false,
2566 ..*p.0.clone()
2567 }))
2568 }
25692570/// Create a new set of HIR properties for a concatenation.
2571fn concat(concat: &[Hir]) -> Properties {
2572// The base case is an empty concatenation, which matches the empty
2573 // string. Note though that empty concatenations aren't possible,
2574 // because the Hir::concat smart constructor rewrites those as
2575 // Hir::empty.
2576let mut props = PropertiesI {
2577 minimum_len: Some(0),
2578 maximum_len: Some(0),
2579 look_set: LookSet::empty(),
2580 look_set_prefix: LookSet::empty(),
2581 look_set_suffix: LookSet::empty(),
2582 look_set_prefix_any: LookSet::empty(),
2583 look_set_suffix_any: LookSet::empty(),
2584 utf8: true,
2585 explicit_captures_len: 0,
2586 static_explicit_captures_len: Some(0),
2587 literal: true,
2588 alternation_literal: true,
2589 };
2590// Handle properties that need to visit every child hir.
2591for x in concat.iter() {
2592let p = x.properties();
2593 props.look_set.set_union(p.look_set());
2594 props.utf8 = props.utf8 && p.is_utf8();
2595 props.explicit_captures_len = props
2596 .explicit_captures_len
2597 .saturating_add(p.explicit_captures_len());
2598 props.static_explicit_captures_len = p
2599 .static_explicit_captures_len()
2600 .and_then(|len1| {
2601Some((len1, props.static_explicit_captures_len?))
2602 })
2603 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2604 props.literal = props.literal && p.is_literal();
2605 props.alternation_literal =
2606 props.alternation_literal && p.is_alternation_literal();
2607if let Some(minimum_len) = props.minimum_len {
2608match p.minimum_len() {
2609None => props.minimum_len = None,
2610Some(len) => {
2611// We use saturating arithmetic here because the
2612 // minimum is just a lower bound. We can't go any
2613 // higher than what our number types permit.
2614props.minimum_len =
2615Some(minimum_len.saturating_add(len));
2616 }
2617 }
2618 }
2619if let Some(maximum_len) = props.maximum_len {
2620match p.maximum_len() {
2621None => props.maximum_len = None,
2622Some(len) => {
2623 props.maximum_len = maximum_len.checked_add(len)
2624 }
2625 }
2626 }
2627 }
2628// Handle the prefix properties, which only requires visiting
2629 // child exprs until one matches more than the empty string.
2630let mut it = concat.iter();
2631while let Some(x) = it.next() {
2632 props.look_set_prefix.set_union(x.properties().look_set_prefix());
2633 props
2634 .look_set_prefix_any
2635 .set_union(x.properties().look_set_prefix_any());
2636if x.properties().maximum_len().map_or(true, |x| x > 0) {
2637break;
2638 }
2639 }
2640// Same thing for the suffix properties, but in reverse.
2641let mut it = concat.iter().rev();
2642while let Some(x) = it.next() {
2643 props.look_set_suffix.set_union(x.properties().look_set_suffix());
2644 props
2645 .look_set_suffix_any
2646 .set_union(x.properties().look_set_suffix_any());
2647if x.properties().maximum_len().map_or(true, |x| x > 0) {
2648break;
2649 }
2650 }
2651Properties(Box::new(props))
2652 }
26532654/// Create a new set of HIR properties for a concatenation.
2655fn alternation(alts: &[Hir]) -> Properties {
2656Properties::union(alts.iter().map(|hir| hir.properties()))
2657 }
2658}
26592660/// A set of look-around assertions.
2661///
2662/// This is useful for efficiently tracking look-around assertions. For
2663/// example, an [`Hir`] provides properties that return `LookSet`s.
2664#[derive(#[automatically_derived]
impl ::core::clone::Clone for LookSet {
#[inline]
fn clone(&self) -> LookSet {
let _: ::core::clone::AssertParamIsClone<u32>;
*self
}
}Clone, #[automatically_derived]
impl ::core::marker::Copy for LookSet { }Copy, #[automatically_derived]
impl ::core::default::Default for LookSet {
#[inline]
fn default() -> LookSet {
LookSet { bits: ::core::default::Default::default() }
}
}Default, #[automatically_derived]
impl ::core::cmp::Eq for LookSet {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<u32>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for LookSet {
#[inline]
fn eq(&self, other: &LookSet) -> bool { self.bits == other.bits }
}PartialEq)]
2665pub struct LookSet {
2666/// The underlying representation this set is exposed to make it possible
2667 /// to store it somewhere efficiently. The representation is that
2668 /// of a bitset, where each assertion occupies bit `i` where `i =
2669 /// Look::as_repr()`.
2670 ///
2671 /// Note that users of this internal representation must permit the full
2672 /// range of `u16` values to be represented. For example, even if the
2673 /// current implementation only makes use of the 10 least significant bits,
2674 /// it may use more bits in a future semver compatible release.
2675pub bits: u32,
2676}
26772678impl LookSet {
2679/// Create an empty set of look-around assertions.
2680#[inline]
2681pub fn empty() -> LookSet {
2682LookSet { bits: 0 }
2683 }
26842685/// Create a full set of look-around assertions.
2686 ///
2687 /// This set contains all possible look-around assertions.
2688#[inline]
2689pub fn full() -> LookSet {
2690LookSet { bits: !0 }
2691 }
26922693/// Create a look-around set containing the look-around assertion given.
2694 ///
2695 /// This is a convenience routine for creating an empty set and inserting
2696 /// one look-around assertions.
2697#[inline]
2698pub fn singleton(look: Look) -> LookSet {
2699LookSet::empty().insert(look)
2700 }
27012702/// Returns the total number of look-around assertions in this set.
2703#[inline]
2704pub fn len(self) -> usize {
2705// OK because max value always fits in a u8, which in turn always
2706 // fits in a usize, regardless of target.
2707usize::try_from(self.bits.count_ones()).unwrap()
2708 }
27092710/// Returns true if and only if this set is empty.
2711#[inline]
2712pub fn is_empty(self) -> bool {
2713self.len() == 0
2714}
27152716/// Returns true if and only if the given look-around assertion is in this
2717 /// set.
2718#[inline]
2719pub fn contains(self, look: Look) -> bool {
2720self.bits & look.as_repr() != 0
2721}
27222723/// Returns true if and only if this set contains any anchor assertions.
2724 /// This includes both "start/end of haystack" and "start/end of line."
2725#[inline]
2726pub fn contains_anchor(&self) -> bool {
2727self.contains_anchor_haystack() || self.contains_anchor_line()
2728 }
27292730/// Returns true if and only if this set contains any "start/end of
2731 /// haystack" anchors. This doesn't include "start/end of line" anchors.
2732#[inline]
2733pub fn contains_anchor_haystack(&self) -> bool {
2734self.contains(Look::Start) || self.contains(Look::End)
2735 }
27362737/// Returns true if and only if this set contains any "start/end of line"
2738 /// anchors. This doesn't include "start/end of haystack" anchors. This
2739 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2740#[inline]
2741pub fn contains_anchor_line(&self) -> bool {
2742self.contains(Look::StartLF)
2743 || self.contains(Look::EndLF)
2744 || self.contains(Look::StartCRLF)
2745 || self.contains(Look::EndCRLF)
2746 }
27472748/// Returns true if and only if this set contains any "start/end of line"
2749 /// anchors that only treat `\n` as line terminators. This does not include
2750 /// haystack anchors or CRLF aware line anchors.
2751#[inline]
2752pub fn contains_anchor_lf(&self) -> bool {
2753self.contains(Look::StartLF) || self.contains(Look::EndLF)
2754 }
27552756/// Returns true if and only if this set contains any "start/end of line"
2757 /// anchors that are CRLF-aware. This doesn't include "start/end of
2758 /// haystack" or "start/end of line-feed" anchors.
2759#[inline]
2760pub fn contains_anchor_crlf(&self) -> bool {
2761self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2762 }
27632764/// Returns true if and only if this set contains any word boundary or
2765 /// negated word boundary assertions. This include both Unicode and ASCII
2766 /// word boundaries.
2767#[inline]
2768pub fn contains_word(self) -> bool {
2769self.contains_word_unicode() || self.contains_word_ascii()
2770 }
27712772/// Returns true if and only if this set contains any Unicode word boundary
2773 /// or negated Unicode word boundary assertions.
2774#[inline]
2775pub fn contains_word_unicode(self) -> bool {
2776self.contains(Look::WordUnicode)
2777 || self.contains(Look::WordUnicodeNegate)
2778 || self.contains(Look::WordStartUnicode)
2779 || self.contains(Look::WordEndUnicode)
2780 || self.contains(Look::WordStartHalfUnicode)
2781 || self.contains(Look::WordEndHalfUnicode)
2782 }
27832784/// Returns true if and only if this set contains any ASCII word boundary
2785 /// or negated ASCII word boundary assertions.
2786#[inline]
2787pub fn contains_word_ascii(self) -> bool {
2788self.contains(Look::WordAscii)
2789 || self.contains(Look::WordAsciiNegate)
2790 || self.contains(Look::WordStartAscii)
2791 || self.contains(Look::WordEndAscii)
2792 || self.contains(Look::WordStartHalfAscii)
2793 || self.contains(Look::WordEndHalfAscii)
2794 }
27952796/// Returns an iterator over all of the look-around assertions in this set.
2797#[inline]
2798pub fn iter(self) -> LookSetIter {
2799LookSetIter { set: self }
2800 }
28012802/// Return a new set that is equivalent to the original, but with the given
2803 /// assertion added to it. If the assertion is already in the set, then the
2804 /// returned set is equivalent to the original.
2805#[inline]
2806pub fn insert(self, look: Look) -> LookSet {
2807LookSet { bits: self.bits | look.as_repr() }
2808 }
28092810/// Updates this set in place with the result of inserting the given
2811 /// assertion into this set.
2812#[inline]
2813pub fn set_insert(&mut self, look: Look) {
2814*self = self.insert(look);
2815 }
28162817/// Return a new set that is equivalent to the original, but with the given
2818 /// assertion removed from it. If the assertion is not in the set, then the
2819 /// returned set is equivalent to the original.
2820#[inline]
2821pub fn remove(self, look: Look) -> LookSet {
2822LookSet { bits: self.bits & !look.as_repr() }
2823 }
28242825/// Updates this set in place with the result of removing the given
2826 /// assertion from this set.
2827#[inline]
2828pub fn set_remove(&mut self, look: Look) {
2829*self = self.remove(look);
2830 }
28312832/// Returns a new set that is the result of subtracting the given set from
2833 /// this set.
2834#[inline]
2835pub fn subtract(self, other: LookSet) -> LookSet {
2836LookSet { bits: self.bits & !other.bits }
2837 }
28382839/// Updates this set in place with the result of subtracting the given set
2840 /// from this set.
2841#[inline]
2842pub fn set_subtract(&mut self, other: LookSet) {
2843*self = self.subtract(other);
2844 }
28452846/// Returns a new set that is the union of this and the one given.
2847#[inline]
2848pub fn union(self, other: LookSet) -> LookSet {
2849LookSet { bits: self.bits | other.bits }
2850 }
28512852/// Updates this set in place with the result of unioning it with the one
2853 /// given.
2854#[inline]
2855pub fn set_union(&mut self, other: LookSet) {
2856*self = self.union(other);
2857 }
28582859/// Returns a new set that is the intersection of this and the one given.
2860#[inline]
2861pub fn intersect(self, other: LookSet) -> LookSet {
2862LookSet { bits: self.bits & other.bits }
2863 }
28642865/// Updates this set in place with the result of intersecting it with the
2866 /// one given.
2867#[inline]
2868pub fn set_intersect(&mut self, other: LookSet) {
2869*self = self.intersect(other);
2870 }
28712872/// Return a `LookSet` from the slice given as a native endian 32-bit
2873 /// integer.
2874 ///
2875 /// # Panics
2876 ///
2877 /// This panics if `slice.len() < 4`.
2878#[inline]
2879pub fn read_repr(slice: &[u8]) -> LookSet {
2880let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2881LookSet { bits }
2882 }
28832884/// Write a `LookSet` as a native endian 32-bit integer to the beginning
2885 /// of the slice given.
2886 ///
2887 /// # Panics
2888 ///
2889 /// This panics if `slice.len() < 4`.
2890#[inline]
2891pub fn write_repr(self, slice: &mut [u8]) {
2892let raw = self.bits.to_ne_bytes();
2893slice[0] = raw[0];
2894slice[1] = raw[1];
2895slice[2] = raw[2];
2896slice[3] = raw[3];
2897 }
2898}
28992900impl core::fmt::Debugfor LookSet {
2901fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2902if self.is_empty() {
2903return f.write_fmt(format_args!("∅"))write!(f, "∅");
2904 }
2905for look in self.iter() {
2906f.write_fmt(format_args!("{0}", look.as_char()))write!(f, "{}", look.as_char())?;
2907 }
2908Ok(())
2909 }
2910}
29112912/// An iterator over all look-around assertions in a [`LookSet`].
2913///
2914/// This iterator is created by [`LookSet::iter`].
2915#[derive(#[automatically_derived]
impl ::core::clone::Clone for LookSetIter {
#[inline]
fn clone(&self) -> LookSetIter {
LookSetIter { set: ::core::clone::Clone::clone(&self.set) }
}
}Clone, #[automatically_derived]
impl ::core::fmt::Debug for LookSetIter {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "LookSetIter",
"set", &&self.set)
}
}Debug)]
2916pub struct LookSetIter {
2917 set: LookSet,
2918}
29192920impl Iteratorfor LookSetIter {
2921type Item = Look;
29222923#[inline]
2924fn next(&mut self) -> Option<Look> {
2925if self.set.is_empty() {
2926return None;
2927 }
2928// We'll never have more than u8::MAX distinct look-around assertions,
2929 // so 'bit' will always fit into a u16.
2930let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2931let look = Look::from_repr(1 << bit)?;
2932self.set = self.set.remove(look);
2933Some(look)
2934 }
2935}
29362937/// Given a sequence of HIR values where each value corresponds to a Unicode
2938/// class (or an all-ASCII byte class), return a single Unicode class
2939/// corresponding to the union of the classes found.
2940fn class_chars(hirs: &[Hir]) -> Option<Class> {
2941let mut cls = ClassUnicode::new(::alloc::vec::Vec::new()vec![]);
2942for hir in hirs.iter() {
2943match *hir.kind() {
2944 HirKind::Class(Class::Unicode(ref cls2)) => {
2945 cls.union(cls2);
2946 }
2947 HirKind::Class(Class::Bytes(ref cls2)) => {
2948 cls.union(&cls2.to_unicode_class()?);
2949 }
2950_ => return None,
2951 };
2952 }
2953Some(Class::Unicode(cls))
2954}
29552956/// Given a sequence of HIR values where each value corresponds to a byte class
2957/// (or an all-ASCII Unicode class), return a single byte class corresponding
2958/// to the union of the classes found.
2959fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2960let mut cls = ClassBytes::new(::alloc::vec::Vec::new()vec![]);
2961for hir in hirs.iter() {
2962match *hir.kind() {
2963 HirKind::Class(Class::Unicode(ref cls2)) => {
2964 cls.union(&cls2.to_byte_class()?);
2965 }
2966 HirKind::Class(Class::Bytes(ref cls2)) => {
2967 cls.union(cls2);
2968 }
2969_ => return None,
2970 };
2971 }
2972Some(Class::Bytes(cls))
2973}
29742975/// Given a sequence of HIR values where each value corresponds to a literal
2976/// that is a single `char`, return that sequence of `char`s. Otherwise return
2977/// None. No deduplication is done.
2978fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2979let mut singletons = ::alloc::vec::Vec::new()vec![];
2980for hir in hirs.iter() {
2981let literal = match *hir.kind() {
2982 HirKind::Literal(Literal(ref bytes)) => bytes,
2983_ => return None,
2984 };
2985let ch = match crate::debug::utf8_decode(literal) {
2986None => return None,
2987Some(Err(_)) => return None,
2988Some(Ok(ch)) => ch,
2989 };
2990if literal.len() != ch.len_utf8() {
2991return None;
2992 }
2993 singletons.push(ch);
2994 }
2995Some(singletons)
2996}
29972998/// Given a sequence of HIR values where each value corresponds to a literal
2999/// that is a single byte, return that sequence of bytes. Otherwise return
3000/// None. No deduplication is done.
3001fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3002let mut singletons = ::alloc::vec::Vec::new()vec![];
3003for hir in hirs.iter() {
3004let literal = match *hir.kind() {
3005 HirKind::Literal(Literal(ref bytes)) => bytes,
3006_ => return None,
3007 };
3008if literal.len() != 1 {
3009return None;
3010 }
3011 singletons.push(literal[0]);
3012 }
3013Some(singletons)
3014}
30153016/// Looks for a common prefix in the list of alternation branches given. If one
3017/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3018/// Otherwise, the original given list of branches is returned unmodified.
3019///
3020/// This is not quite as good as it could be. Right now, it requires that
3021/// all branches are 'Concat' expressions. It also doesn't do well with
3022/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3023/// 'foo(?:foo|bar)' because literals are flattened into their own special
3024/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3025/// instead of a string of bytes because of this. Otherwise, handling the
3026/// current representation in this routine will be pretty gnarly. Sigh.)
3027fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3028if hirs.len() <= 1 {
3029return Err(hirs);
3030 }
3031let mut prefix = match hirs[0].kind() {
3032 HirKind::Concat(ref xs) => &**xs,
3033_ => return Err(hirs),
3034 };
3035if prefix.is_empty() {
3036return Err(hirs);
3037 }
3038for h in hirs.iter().skip(1) {
3039let concat = match h.kind() {
3040 HirKind::Concat(ref xs) => xs,
3041_ => return Err(hirs),
3042 };
3043let common_len = prefix
3044 .iter()
3045 .zip(concat.iter())
3046 .take_while(|(x, y)| x == y)
3047 .count();
3048 prefix = &prefix[..common_len];
3049if prefix.is_empty() {
3050return Err(hirs);
3051 }
3052 }
3053let len = prefix.len();
3054match (&0, &len) {
(left_val, right_val) => {
if *left_val == *right_val {
let kind = ::core::panicking::AssertKind::Ne;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_ne!(0, len);
3055let mut prefix_concat = ::alloc::vec::Vec::new()vec![];
3056let mut suffix_alts = ::alloc::vec::Vec::new()vec![];
3057for h in hirs {
3058let mut concat = match h.into_kind() {
3059 HirKind::Concat(xs) => xs,
3060// We required all sub-expressions to be
3061 // concats above, so we're only here if we
3062 // have a concat.
3063_ => ::core::panicking::panic("internal error: entered unreachable code")unreachable!(),
3064 };
3065 suffix_alts.push(Hir::concat(concat.split_off(len)));
3066if prefix_concat.is_empty() {
3067 prefix_concat = concat;
3068 }
3069 }
3070let mut concat = prefix_concat;
3071concat.push(Hir::alternation(suffix_alts));
3072Ok(Hir::concat(concat))
3073}
30743075#[cfg(test)]
3076mod tests {
3077use super::*;
30783079fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3080let ranges: Vec<ClassUnicodeRange> = ranges
3081 .iter()
3082 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3083 .collect();
3084 ClassUnicode::new(ranges)
3085 }
30863087fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3088let ranges: Vec<ClassBytesRange> =
3089 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3090 ClassBytes::new(ranges)
3091 }
30923093fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3094 cls.iter().map(|x| (x.start(), x.end())).collect()
3095 }
30963097#[cfg(feature = "unicode-case")]
3098fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3099let mut cls_ = cls.clone();
3100 cls_.case_fold_simple();
3101 cls_
3102 }
31033104fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105let mut cls_ = cls1.clone();
3106 cls_.union(cls2);
3107 cls_
3108 }
31093110fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3111let mut cls_ = cls1.clone();
3112 cls_.intersect(cls2);
3113 cls_
3114 }
31153116fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3117let mut cls_ = cls1.clone();
3118 cls_.difference(cls2);
3119 cls_
3120 }
31213122fn usymdifference(
3123 cls1: &ClassUnicode,
3124 cls2: &ClassUnicode,
3125 ) -> ClassUnicode {
3126let mut cls_ = cls1.clone();
3127 cls_.symmetric_difference(cls2);
3128 cls_
3129 }
31303131fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3132let mut cls_ = cls.clone();
3133 cls_.negate();
3134 cls_
3135 }
31363137fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3138 cls.iter().map(|x| (x.start(), x.end())).collect()
3139 }
31403141fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3142let mut cls_ = cls.clone();
3143 cls_.case_fold_simple();
3144 cls_
3145 }
31463147fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148let mut cls_ = cls1.clone();
3149 cls_.union(cls2);
3150 cls_
3151 }
31523153fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154let mut cls_ = cls1.clone();
3155 cls_.intersect(cls2);
3156 cls_
3157 }
31583159fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3160let mut cls_ = cls1.clone();
3161 cls_.difference(cls2);
3162 cls_
3163 }
31643165fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3166let mut cls_ = cls1.clone();
3167 cls_.symmetric_difference(cls2);
3168 cls_
3169 }
31703171fn bnegate(cls: &ClassBytes) -> ClassBytes {
3172let mut cls_ = cls.clone();
3173 cls_.negate();
3174 cls_
3175 }
31763177#[test]
3178fn class_range_canonical_unicode() {
3179let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3180assert_eq!('\0', range.start());
3181assert_eq!('\u{00FF}', range.end());
3182 }
31833184#[test]
3185fn class_range_canonical_bytes() {
3186let range = ClassBytesRange::new(b'\xFF', b'\0');
3187assert_eq!(b'\0', range.start());
3188assert_eq!(b'\xFF', range.end());
3189 }
31903191#[test]
3192fn class_canonicalize_unicode() {
3193let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3194let expected = vec![('a', 'c'), ('x', 'z')];
3195assert_eq!(expected, uranges(&cls));
31963197let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3198let expected = vec![('a', 'c'), ('x', 'z')];
3199assert_eq!(expected, uranges(&cls));
32003201let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3202let expected = vec![('w', 'z')];
3203assert_eq!(expected, uranges(&cls));
32043205let cls = uclass(&[
3206 ('c', 'f'),
3207 ('a', 'g'),
3208 ('d', 'j'),
3209 ('a', 'c'),
3210 ('m', 'p'),
3211 ('l', 's'),
3212 ]);
3213let expected = vec![('a', 'j'), ('l', 's')];
3214assert_eq!(expected, uranges(&cls));
32153216let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3217let expected = vec![('u', 'z')];
3218assert_eq!(expected, uranges(&cls));
32193220let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3221let expected = vec![('\x00', '\u{10FFFF}')];
3222assert_eq!(expected, uranges(&cls));
32233224let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3225let expected = vec![('a', 'b')];
3226assert_eq!(expected, uranges(&cls));
3227 }
32283229#[test]
3230fn class_canonicalize_bytes() {
3231let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3232let expected = vec![(b'a', b'c'), (b'x', b'z')];
3233assert_eq!(expected, branges(&cls));
32343235let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3236let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237assert_eq!(expected, branges(&cls));
32383239let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3240let expected = vec![(b'w', b'z')];
3241assert_eq!(expected, branges(&cls));
32423243let cls = bclass(&[
3244 (b'c', b'f'),
3245 (b'a', b'g'),
3246 (b'd', b'j'),
3247 (b'a', b'c'),
3248 (b'm', b'p'),
3249 (b'l', b's'),
3250 ]);
3251let expected = vec![(b'a', b'j'), (b'l', b's')];
3252assert_eq!(expected, branges(&cls));
32533254let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3255let expected = vec![(b'u', b'z')];
3256assert_eq!(expected, branges(&cls));
32573258let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3259let expected = vec![(b'\x00', b'\xFF')];
3260assert_eq!(expected, branges(&cls));
32613262let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3263let expected = vec![(b'a', b'b')];
3264assert_eq!(expected, branges(&cls));
3265 }
32663267#[test]
3268 #[cfg(feature = "unicode-case")]
3269fn class_case_fold_unicode() {
3270let cls = uclass(&[
3271 ('C', 'F'),
3272 ('A', 'G'),
3273 ('D', 'J'),
3274 ('A', 'C'),
3275 ('M', 'P'),
3276 ('L', 'S'),
3277 ('c', 'f'),
3278 ]);
3279let expected = uclass(&[
3280 ('A', 'J'),
3281 ('L', 'S'),
3282 ('a', 'j'),
3283 ('l', 's'),
3284 ('\u{17F}', '\u{17F}'),
3285 ]);
3286assert_eq!(expected, ucasefold(&cls));
32873288let cls = uclass(&[('A', 'Z')]);
3289let expected = uclass(&[
3290 ('A', 'Z'),
3291 ('a', 'z'),
3292 ('\u{17F}', '\u{17F}'),
3293 ('\u{212A}', '\u{212A}'),
3294 ]);
3295assert_eq!(expected, ucasefold(&cls));
32963297let cls = uclass(&[('a', 'z')]);
3298let expected = uclass(&[
3299 ('A', 'Z'),
3300 ('a', 'z'),
3301 ('\u{17F}', '\u{17F}'),
3302 ('\u{212A}', '\u{212A}'),
3303 ]);
3304assert_eq!(expected, ucasefold(&cls));
33053306let cls = uclass(&[('A', 'A'), ('_', '_')]);
3307let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3308assert_eq!(expected, ucasefold(&cls));
33093310let cls = uclass(&[('A', 'A'), ('=', '=')]);
3311let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3312assert_eq!(expected, ucasefold(&cls));
33133314let cls = uclass(&[('\x00', '\x10')]);
3315assert_eq!(cls, ucasefold(&cls));
33163317let cls = uclass(&[('k', 'k')]);
3318let expected =
3319 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3320assert_eq!(expected, ucasefold(&cls));
33213322let cls = uclass(&[('@', '@')]);
3323assert_eq!(cls, ucasefold(&cls));
3324 }
33253326#[test]
3327 #[cfg(not(feature = "unicode-case"))]
3328fn class_case_fold_unicode_disabled() {
3329let mut cls = uclass(&[
3330 ('C', 'F'),
3331 ('A', 'G'),
3332 ('D', 'J'),
3333 ('A', 'C'),
3334 ('M', 'P'),
3335 ('L', 'S'),
3336 ('c', 'f'),
3337 ]);
3338assert!(cls.try_case_fold_simple().is_err());
3339 }
33403341#[test]
3342 #[should_panic]
3343 #[cfg(not(feature = "unicode-case"))]
3344fn class_case_fold_unicode_disabled_panics() {
3345let mut cls = uclass(&[
3346 ('C', 'F'),
3347 ('A', 'G'),
3348 ('D', 'J'),
3349 ('A', 'C'),
3350 ('M', 'P'),
3351 ('L', 'S'),
3352 ('c', 'f'),
3353 ]);
3354 cls.case_fold_simple();
3355 }
33563357#[test]
3358fn class_case_fold_bytes() {
3359let cls = bclass(&[
3360 (b'C', b'F'),
3361 (b'A', b'G'),
3362 (b'D', b'J'),
3363 (b'A', b'C'),
3364 (b'M', b'P'),
3365 (b'L', b'S'),
3366 (b'c', b'f'),
3367 ]);
3368let expected =
3369 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3370assert_eq!(expected, bcasefold(&cls));
33713372let cls = bclass(&[(b'A', b'Z')]);
3373let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3374assert_eq!(expected, bcasefold(&cls));
33753376let cls = bclass(&[(b'a', b'z')]);
3377let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378assert_eq!(expected, bcasefold(&cls));
33793380let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3381let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3382assert_eq!(expected, bcasefold(&cls));
33833384let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3385let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3386assert_eq!(expected, bcasefold(&cls));
33873388let cls = bclass(&[(b'\x00', b'\x10')]);
3389assert_eq!(cls, bcasefold(&cls));
33903391let cls = bclass(&[(b'k', b'k')]);
3392let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3393assert_eq!(expected, bcasefold(&cls));
33943395let cls = bclass(&[(b'@', b'@')]);
3396assert_eq!(cls, bcasefold(&cls));
3397 }
33983399#[test]
3400fn class_negate_unicode() {
3401let cls = uclass(&[('a', 'a')]);
3402let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3403assert_eq!(expected, unegate(&cls));
34043405let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3406let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3407assert_eq!(expected, unegate(&cls));
34083409let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3410let expected = uclass(&[
3411 ('\x00', '\x60'),
3412 ('\x64', '\x77'),
3413 ('\x7B', '\u{10FFFF}'),
3414 ]);
3415assert_eq!(expected, unegate(&cls));
34163417let cls = uclass(&[('\x00', 'a')]);
3418let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3419assert_eq!(expected, unegate(&cls));
34203421let cls = uclass(&[('a', '\u{10FFFF}')]);
3422let expected = uclass(&[('\x00', '\x60')]);
3423assert_eq!(expected, unegate(&cls));
34243425let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3426let expected = uclass(&[]);
3427assert_eq!(expected, unegate(&cls));
34283429let cls = uclass(&[]);
3430let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3431assert_eq!(expected, unegate(&cls));
34323433let cls =
3434 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3435let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3436assert_eq!(expected, unegate(&cls));
34373438let cls = uclass(&[('\x00', '\u{D7FF}')]);
3439let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3440assert_eq!(expected, unegate(&cls));
34413442let cls = uclass(&[('\x00', '\u{D7FE}')]);
3443let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3444assert_eq!(expected, unegate(&cls));
34453446let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3447let expected = uclass(&[('\x00', '\u{D7FF}')]);
3448assert_eq!(expected, unegate(&cls));
34493450let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3451let expected = uclass(&[('\x00', '\u{E000}')]);
3452assert_eq!(expected, unegate(&cls));
3453 }
34543455#[test]
3456fn class_negate_bytes() {
3457let cls = bclass(&[(b'a', b'a')]);
3458let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3459assert_eq!(expected, bnegate(&cls));
34603461let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3462let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3463assert_eq!(expected, bnegate(&cls));
34643465let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3466let expected = bclass(&[
3467 (b'\x00', b'\x60'),
3468 (b'\x64', b'\x77'),
3469 (b'\x7B', b'\xFF'),
3470 ]);
3471assert_eq!(expected, bnegate(&cls));
34723473let cls = bclass(&[(b'\x00', b'a')]);
3474let expected = bclass(&[(b'\x62', b'\xFF')]);
3475assert_eq!(expected, bnegate(&cls));
34763477let cls = bclass(&[(b'a', b'\xFF')]);
3478let expected = bclass(&[(b'\x00', b'\x60')]);
3479assert_eq!(expected, bnegate(&cls));
34803481let cls = bclass(&[(b'\x00', b'\xFF')]);
3482let expected = bclass(&[]);
3483assert_eq!(expected, bnegate(&cls));
34843485let cls = bclass(&[]);
3486let expected = bclass(&[(b'\x00', b'\xFF')]);
3487assert_eq!(expected, bnegate(&cls));
34883489let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3490let expected = bclass(&[(b'\xFE', b'\xFE')]);
3491assert_eq!(expected, bnegate(&cls));
3492 }
34933494#[test]
3495fn class_union_unicode() {
3496let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3497let cls2 = uclass(&[('a', 'z')]);
3498let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3499assert_eq!(expected, uunion(&cls1, &cls2));
3500 }
35013502#[test]
3503fn class_union_bytes() {
3504let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3505let cls2 = bclass(&[(b'a', b'z')]);
3506let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3507assert_eq!(expected, bunion(&cls1, &cls2));
3508 }
35093510#[test]
3511fn class_intersect_unicode() {
3512let cls1 = uclass(&[]);
3513let cls2 = uclass(&[('a', 'a')]);
3514let expected = uclass(&[]);
3515assert_eq!(expected, uintersect(&cls1, &cls2));
35163517let cls1 = uclass(&[('a', 'a')]);
3518let cls2 = uclass(&[('a', 'a')]);
3519let expected = uclass(&[('a', 'a')]);
3520assert_eq!(expected, uintersect(&cls1, &cls2));
35213522let cls1 = uclass(&[('a', 'a')]);
3523let cls2 = uclass(&[('b', 'b')]);
3524let expected = uclass(&[]);
3525assert_eq!(expected, uintersect(&cls1, &cls2));
35263527let cls1 = uclass(&[('a', 'a')]);
3528let cls2 = uclass(&[('a', 'c')]);
3529let expected = uclass(&[('a', 'a')]);
3530assert_eq!(expected, uintersect(&cls1, &cls2));
35313532let cls1 = uclass(&[('a', 'b')]);
3533let cls2 = uclass(&[('a', 'c')]);
3534let expected = uclass(&[('a', 'b')]);
3535assert_eq!(expected, uintersect(&cls1, &cls2));
35363537let cls1 = uclass(&[('a', 'b')]);
3538let cls2 = uclass(&[('b', 'c')]);
3539let expected = uclass(&[('b', 'b')]);
3540assert_eq!(expected, uintersect(&cls1, &cls2));
35413542let cls1 = uclass(&[('a', 'b')]);
3543let cls2 = uclass(&[('c', 'd')]);
3544let expected = uclass(&[]);
3545assert_eq!(expected, uintersect(&cls1, &cls2));
35463547let cls1 = uclass(&[('b', 'c')]);
3548let cls2 = uclass(&[('a', 'd')]);
3549let expected = uclass(&[('b', 'c')]);
3550assert_eq!(expected, uintersect(&cls1, &cls2));
35513552let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3553let cls2 = uclass(&[('a', 'h')]);
3554let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3555assert_eq!(expected, uintersect(&cls1, &cls2));
35563557let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3558let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3560assert_eq!(expected, uintersect(&cls1, &cls2));
35613562let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3563let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3564let expected = uclass(&[]);
3565assert_eq!(expected, uintersect(&cls1, &cls2));
35663567let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3568let cls2 = uclass(&[('h', 'h')]);
3569let expected = uclass(&[('h', 'h')]);
3570assert_eq!(expected, uintersect(&cls1, &cls2));
35713572let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3573let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3574let expected = uclass(&[]);
3575assert_eq!(expected, uintersect(&cls1, &cls2));
35763577let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3578let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3579let expected = uclass(&[('b', 'f')]);
3580assert_eq!(expected, uintersect(&cls1, &cls2));
3581 }
35823583#[test]
3584fn class_intersect_bytes() {
3585let cls1 = bclass(&[]);
3586let cls2 = bclass(&[(b'a', b'a')]);
3587let expected = bclass(&[]);
3588assert_eq!(expected, bintersect(&cls1, &cls2));
35893590let cls1 = bclass(&[(b'a', b'a')]);
3591let cls2 = bclass(&[(b'a', b'a')]);
3592let expected = bclass(&[(b'a', b'a')]);
3593assert_eq!(expected, bintersect(&cls1, &cls2));
35943595let cls1 = bclass(&[(b'a', b'a')]);
3596let cls2 = bclass(&[(b'b', b'b')]);
3597let expected = bclass(&[]);
3598assert_eq!(expected, bintersect(&cls1, &cls2));
35993600let cls1 = bclass(&[(b'a', b'a')]);
3601let cls2 = bclass(&[(b'a', b'c')]);
3602let expected = bclass(&[(b'a', b'a')]);
3603assert_eq!(expected, bintersect(&cls1, &cls2));
36043605let cls1 = bclass(&[(b'a', b'b')]);
3606let cls2 = bclass(&[(b'a', b'c')]);
3607let expected = bclass(&[(b'a', b'b')]);
3608assert_eq!(expected, bintersect(&cls1, &cls2));
36093610let cls1 = bclass(&[(b'a', b'b')]);
3611let cls2 = bclass(&[(b'b', b'c')]);
3612let expected = bclass(&[(b'b', b'b')]);
3613assert_eq!(expected, bintersect(&cls1, &cls2));
36143615let cls1 = bclass(&[(b'a', b'b')]);
3616let cls2 = bclass(&[(b'c', b'd')]);
3617let expected = bclass(&[]);
3618assert_eq!(expected, bintersect(&cls1, &cls2));
36193620let cls1 = bclass(&[(b'b', b'c')]);
3621let cls2 = bclass(&[(b'a', b'd')]);
3622let expected = bclass(&[(b'b', b'c')]);
3623assert_eq!(expected, bintersect(&cls1, &cls2));
36243625let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3626let cls2 = bclass(&[(b'a', b'h')]);
3627let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3628assert_eq!(expected, bintersect(&cls1, &cls2));
36293630let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3631let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3633assert_eq!(expected, bintersect(&cls1, &cls2));
36343635let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3636let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3637let expected = bclass(&[]);
3638assert_eq!(expected, bintersect(&cls1, &cls2));
36393640let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3641let cls2 = bclass(&[(b'h', b'h')]);
3642let expected = bclass(&[(b'h', b'h')]);
3643assert_eq!(expected, bintersect(&cls1, &cls2));
36443645let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3646let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3647let expected = bclass(&[]);
3648assert_eq!(expected, bintersect(&cls1, &cls2));
36493650let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3651let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3652let expected = bclass(&[(b'b', b'f')]);
3653assert_eq!(expected, bintersect(&cls1, &cls2));
3654 }
36553656#[test]
3657fn class_difference_unicode() {
3658let cls1 = uclass(&[('a', 'a')]);
3659let cls2 = uclass(&[('a', 'a')]);
3660let expected = uclass(&[]);
3661assert_eq!(expected, udifference(&cls1, &cls2));
36623663let cls1 = uclass(&[('a', 'a')]);
3664let cls2 = uclass(&[]);
3665let expected = uclass(&[('a', 'a')]);
3666assert_eq!(expected, udifference(&cls1, &cls2));
36673668let cls1 = uclass(&[]);
3669let cls2 = uclass(&[('a', 'a')]);
3670let expected = uclass(&[]);
3671assert_eq!(expected, udifference(&cls1, &cls2));
36723673let cls1 = uclass(&[('a', 'z')]);
3674let cls2 = uclass(&[('a', 'a')]);
3675let expected = uclass(&[('b', 'z')]);
3676assert_eq!(expected, udifference(&cls1, &cls2));
36773678let cls1 = uclass(&[('a', 'z')]);
3679let cls2 = uclass(&[('z', 'z')]);
3680let expected = uclass(&[('a', 'y')]);
3681assert_eq!(expected, udifference(&cls1, &cls2));
36823683let cls1 = uclass(&[('a', 'z')]);
3684let cls2 = uclass(&[('m', 'm')]);
3685let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3686assert_eq!(expected, udifference(&cls1, &cls2));
36873688let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3689let cls2 = uclass(&[('a', 'z')]);
3690let expected = uclass(&[]);
3691assert_eq!(expected, udifference(&cls1, &cls2));
36923693let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3694let cls2 = uclass(&[('d', 'v')]);
3695let expected = uclass(&[('a', 'c')]);
3696assert_eq!(expected, udifference(&cls1, &cls2));
36973698let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3699let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3700let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3701assert_eq!(expected, udifference(&cls1, &cls2));
37023703let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3704let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3705let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3706assert_eq!(expected, udifference(&cls1, &cls2));
37073708let cls1 = uclass(&[('x', 'z')]);
3709let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3710let expected = uclass(&[('x', 'z')]);
3711assert_eq!(expected, udifference(&cls1, &cls2));
37123713let cls1 = uclass(&[('a', 'z')]);
3714let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3715let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3716assert_eq!(expected, udifference(&cls1, &cls2));
3717 }
37183719#[test]
3720fn class_difference_bytes() {
3721let cls1 = bclass(&[(b'a', b'a')]);
3722let cls2 = bclass(&[(b'a', b'a')]);
3723let expected = bclass(&[]);
3724assert_eq!(expected, bdifference(&cls1, &cls2));
37253726let cls1 = bclass(&[(b'a', b'a')]);
3727let cls2 = bclass(&[]);
3728let expected = bclass(&[(b'a', b'a')]);
3729assert_eq!(expected, bdifference(&cls1, &cls2));
37303731let cls1 = bclass(&[]);
3732let cls2 = bclass(&[(b'a', b'a')]);
3733let expected = bclass(&[]);
3734assert_eq!(expected, bdifference(&cls1, &cls2));
37353736let cls1 = bclass(&[(b'a', b'z')]);
3737let cls2 = bclass(&[(b'a', b'a')]);
3738let expected = bclass(&[(b'b', b'z')]);
3739assert_eq!(expected, bdifference(&cls1, &cls2));
37403741let cls1 = bclass(&[(b'a', b'z')]);
3742let cls2 = bclass(&[(b'z', b'z')]);
3743let expected = bclass(&[(b'a', b'y')]);
3744assert_eq!(expected, bdifference(&cls1, &cls2));
37453746let cls1 = bclass(&[(b'a', b'z')]);
3747let cls2 = bclass(&[(b'm', b'm')]);
3748let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3749assert_eq!(expected, bdifference(&cls1, &cls2));
37503751let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3752let cls2 = bclass(&[(b'a', b'z')]);
3753let expected = bclass(&[]);
3754assert_eq!(expected, bdifference(&cls1, &cls2));
37553756let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3757let cls2 = bclass(&[(b'd', b'v')]);
3758let expected = bclass(&[(b'a', b'c')]);
3759assert_eq!(expected, bdifference(&cls1, &cls2));
37603761let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3762let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3763let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3764assert_eq!(expected, bdifference(&cls1, &cls2));
37653766let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3767let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3768let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3769assert_eq!(expected, bdifference(&cls1, &cls2));
37703771let cls1 = bclass(&[(b'x', b'z')]);
3772let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3773let expected = bclass(&[(b'x', b'z')]);
3774assert_eq!(expected, bdifference(&cls1, &cls2));
37753776let cls1 = bclass(&[(b'a', b'z')]);
3777let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3778let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3779assert_eq!(expected, bdifference(&cls1, &cls2));
3780 }
37813782#[test]
3783fn class_symmetric_difference_unicode() {
3784let cls1 = uclass(&[('a', 'm')]);
3785let cls2 = uclass(&[('g', 't')]);
3786let expected = uclass(&[('a', 'f'), ('n', 't')]);
3787assert_eq!(expected, usymdifference(&cls1, &cls2));
3788 }
37893790#[test]
3791fn class_symmetric_difference_bytes() {
3792let cls1 = bclass(&[(b'a', b'm')]);
3793let cls2 = bclass(&[(b'g', b't')]);
3794let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3795assert_eq!(expected, bsymdifference(&cls1, &cls2));
3796 }
37973798// We use a thread with an explicit stack size to test that our destructor
3799 // for Hir can handle arbitrarily sized expressions in constant stack
3800 // space. In case we run on a platform without threads (WASM?), we limit
3801 // this test to Windows/Unix.
3802#[test]
3803 #[cfg(any(unix, windows))]
3804fn no_stack_overflow_on_drop() {
3805use std::thread;
38063807let run = || {
3808let mut expr = Hir::empty();
3809for _ in 0..100 {
3810 expr = Hir::capture(Capture {
3811 index: 1,
3812 name: None,
3813 sub: Box::new(expr),
3814 });
3815 expr = Hir::repetition(Repetition {
3816 min: 0,
3817 max: Some(1),
3818 greedy: true,
3819 sub: Box::new(expr),
3820 });
38213822 expr = Hir {
3823 kind: HirKind::Concat(vec![expr]),
3824 props: Properties::empty(),
3825 };
3826 expr = Hir {
3827 kind: HirKind::Alternation(vec![expr]),
3828 props: Properties::empty(),
3829 };
3830 }
3831assert!(!matches!(*expr.kind(), HirKind::Empty));
3832 };
38333834// We run our test on a thread with a small stack size so we can
3835 // force the issue more easily.
3836 //
3837 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3838 // for context on the specific stack size chosen here.
3839thread::Builder::new()
3840 .stack_size(16 << 10)
3841 .spawn(run)
3842 .unwrap()
3843 .join()
3844 .unwrap();
3845 }
38463847#[test]
3848fn look_set_iter() {
3849let set = LookSet::empty();
3850assert_eq!(0, set.iter().count());
38513852let set = LookSet::full();
3853assert_eq!(18, set.iter().count());
38543855let set =
3856 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3857assert_eq!(2, set.iter().count());
38583859let set = LookSet::empty().insert(Look::StartLF);
3860assert_eq!(1, set.iter().count());
38613862let set = LookSet::empty().insert(Look::WordAsciiNegate);
3863assert_eq!(1, set.iter().count());
3864 }
38653866#[test]
3867fn look_set_debug() {
3868let res = format!("{:?}", LookSet::empty());
3869assert_eq!("∅", res);
3870let res = format!("{:?}", LookSet::full());
3871assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3872 }
3873}