/*!
Provides routines for interpolating capture group references.

That is, if a replacement string contains references like `$foo` or `${foo1}`,
then they are replaced with the corresponding capture values for the groups
named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
is supported as well, with `1` corresponding to a capture group index and not
a name.

This module provides the free functions [`string`] and [`bytes`], which
interpolate Rust Unicode strings and byte strings, respectively.

# Format

These routines support two different kinds of capture references: unbraced and
braced.

For the unbraced format, the format supported is `$ref` where `ref` can be
any non-empty sequence of characters in the class `[0-9A-Za-z_]`. `ref` is
always the longest possible parse. So for example, `$1a` corresponds to the
capture group named `1a` and not the capture group at index `1`. If `ref`
matches `^[0-9]+$`, then it is treated as a capture group index itself and not
a name.

For the braced format, the format supported is `${ref}` where `ref` can be any
sequence of bytes except for `}`. If no closing brace occurs, then it is not
considered a capture reference. As with the unbraced format, if `ref` matches
`^[0-9]+$`, then it is treated as a capture group index and not a name.

The braced format is useful for exerting precise control over the name of the
capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported by
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
named `foo[bar].baz`.

If a capture group reference is found and it does not refer to a valid capture
group, then it will be replaced with the empty string.

To write a literal `$`, use `$$`.

To be clear, and as exhibited via the type signatures in the routines in this
module, it is impossible for a replacement string to be invalid. A replacement
string may not have the intended semantics, but the interpolation procedure
itself can never fail.
*/

use alloc::{string::String, vec::Vec};

use crate::util::memchr::memchr;

5152/// Accepts a replacement string and interpolates capture references with their
53/// corresponding values.
54///
55/// `append` should be a function that appends the string value of a capture
56/// group at a particular index to the string given. If the capture group
57/// index is invalid, then nothing should be appended.
58///
59/// `name_to_index` should be a function that maps a capture group name to a
60/// capture group index. If the given name doesn't exist, then `None` should
61/// be returned.
62///
63/// Finally, `dst` is where the final interpolated contents should be written.
64/// If `replacement` contains no capture group references, then `dst` will be
65/// equivalent to `replacement`.
66///
67/// See the [module documentation](self) for details about the format
68/// supported.
69///
70/// # Example
71///
72/// ```
73/// use regex_automata::util::interpolate;
74///
75/// let mut dst = String::new();
76/// interpolate::string(
77/// "foo $bar baz",
78/// |index, dst| {
79/// if index == 0 {
80/// dst.push_str("BAR");
81/// }
82/// },
83/// |name| {
84/// if name == "bar" {
85/// Some(0)
86/// } else {
87/// None
88/// }
89/// },
90/// &mut dst,
91/// );
92/// assert_eq!("foo BAR baz", dst);
93/// ```
94pub fn string(
95mut replacement: &str,
96mut append: impl FnMut(usize, &mut String),
97mut name_to_index: impl FnMut(&str) -> Option<usize>,
98 dst: &mut String,
99) {
100while !replacement.is_empty() {
101match memchr(b'$', replacement.as_bytes()) {
102None => break,
103Some(i) => {
104 dst.push_str(&replacement[..i]);
105 replacement = &replacement[i..];
106 }
107 }
108// Handle escaping of '$'.
109if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
110 dst.push_str("$");
111 replacement = &replacement[2..];
112continue;
113 }
114if true {
if !!replacement.is_empty() {
::core::panicking::panic("assertion failed: !replacement.is_empty()")
};
};debug_assert!(!replacement.is_empty());
115let cap_ref = match find_cap_ref(replacement.as_bytes()) {
116Some(cap_ref) => cap_ref,
117None => {
118 dst.push_str("$");
119 replacement = &replacement[1..];
120continue;
121 }
122 };
123 replacement = &replacement[cap_ref.end..];
124match cap_ref.cap {
125 Ref::Number(i) => append(i, dst),
126 Ref::Named(name) => {
127if let Some(i) = name_to_index(name) {
128 append(i, dst);
129 }
130 }
131 }
132 }
133dst.push_str(replacement);
134}
135136/// Accepts a replacement byte string and interpolates capture references with
137/// their corresponding values.
138///
139/// `append` should be a function that appends the byte string value of a
140/// capture group at a particular index to the byte string given. If the
141/// capture group index is invalid, then nothing should be appended.
142///
143/// `name_to_index` should be a function that maps a capture group name to a
144/// capture group index. If the given name doesn't exist, then `None` should
145/// be returned.
146///
147/// Finally, `dst` is where the final interpolated contents should be written.
148/// If `replacement` contains no capture group references, then `dst` will be
149/// equivalent to `replacement`.
150///
151/// See the [module documentation](self) for details about the format
152/// supported.
153///
154/// # Example
155///
156/// ```
157/// use regex_automata::util::interpolate;
158///
159/// let mut dst = vec![];
160/// interpolate::bytes(
161/// b"foo $bar baz",
162/// |index, dst| {
163/// if index == 0 {
164/// dst.extend_from_slice(b"BAR");
165/// }
166/// },
167/// |name| {
168/// if name == "bar" {
169/// Some(0)
170/// } else {
171/// None
172/// }
173/// },
174/// &mut dst,
175/// );
176/// assert_eq!(&b"foo BAR baz"[..], dst);
177/// ```
178pub fn bytes(
179mut replacement: &[u8],
180mut append: impl FnMut(usize, &mut Vec<u8>),
181mut name_to_index: impl FnMut(&str) -> Option<usize>,
182 dst: &mut Vec<u8>,
183) {
184while !replacement.is_empty() {
185match memchr(b'$', replacement) {
186None => break,
187Some(i) => {
188 dst.extend_from_slice(&replacement[..i]);
189 replacement = &replacement[i..];
190 }
191 }
192// Handle escaping of '$'.
193if replacement.get(1).map_or(false, |&b| b == b'$') {
194 dst.push(b'$');
195 replacement = &replacement[2..];
196continue;
197 }
198if true {
if !!replacement.is_empty() {
::core::panicking::panic("assertion failed: !replacement.is_empty()")
};
};debug_assert!(!replacement.is_empty());
199let cap_ref = match find_cap_ref(replacement) {
200Some(cap_ref) => cap_ref,
201None => {
202 dst.push(b'$');
203 replacement = &replacement[1..];
204continue;
205 }
206 };
207 replacement = &replacement[cap_ref.end..];
208match cap_ref.cap {
209 Ref::Number(i) => append(i, dst),
210 Ref::Named(name) => {
211if let Some(i) = name_to_index(name) {
212 append(i, dst);
213 }
214 }
215 }
216 }
217dst.extend_from_slice(replacement);
218}
219220/// `CaptureRef` represents a reference to a capture group inside some text.
221/// The reference is either a capture group name or a number.
222///
223/// It is also tagged with the position in the text following the
224/// capture reference.
225#[derive(#[automatically_derived]
impl<'a> ::core::clone::Clone for CaptureRef<'a> {
#[inline]
fn clone(&self) -> CaptureRef<'a> {
let _: ::core::clone::AssertParamIsClone<Ref<'a>>;
let _: ::core::clone::AssertParamIsClone<usize>;
*self
}
}Clone, #[automatically_derived]
impl<'a> ::core::marker::Copy for CaptureRef<'a> { }Copy, #[automatically_derived]
impl<'a> ::core::fmt::Debug for CaptureRef<'a> {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field2_finish(f, "CaptureRef",
"cap", &self.cap, "end", &&self.end)
}
}Debug, #[automatically_derived]
impl<'a> ::core::cmp::Eq for CaptureRef<'a> {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Ref<'a>>;
let _: ::core::cmp::AssertParamIsEq<usize>;
}
}Eq, #[automatically_derived]
impl<'a> ::core::cmp::PartialEq for CaptureRef<'a> {
#[inline]
fn eq(&self, other: &CaptureRef<'a>) -> bool {
self.cap == other.cap && self.end == other.end
}
}PartialEq)]
226struct CaptureRef<'a> {
227 cap: Ref<'a>,
228 end: usize,
229}
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by capture group name, borrowed from the replacement text.
    Named(&'a str),
    /// A reference by capture group index.
    Number(usize),
}
239240impl<'a> From<&'a str> for Ref<'a> {
241fn from(x: &'a str) -> Ref<'a> {
242 Ref::Named(x)
243 }
244}
245246impl From<usize> for Ref<'static> {
247fn from(x: usize) -> Ref<'static> {
248 Ref::Number(x)
249 }
250}
251252/// Parses a possible reference to a capture group name in the given text,
253/// starting at the beginning of `replacement`.
254///
255/// If no such valid reference could be found, None is returned.
256///
257/// Note that this returns a "possible" reference because this routine doesn't
258/// know whether the reference is to a valid group or not. If it winds up not
259/// being a valid reference, then it should be replaced with the empty string.
260fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
261let mut i = 0;
262let rep: &[u8] = replacement;
263if rep.len() <= 1 || rep[0] != b'$' {
264return None;
265 }
266i += 1;
267if rep[i] == b'{' {
268return find_cap_ref_braced(rep, i + 1);
269 }
270let mut cap_end = i;
271while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
272 cap_end += 1;
273 }
274if cap_end == i {
275return None;
276 }
277// We just verified that the range 0..cap_end is valid ASCII, so it must
278 // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
279 // check via an unchecked conversion or by parsing the number straight from
280 // &[u8].
281let cap = core::str::from_utf8(&rep[i..cap_end])
282 .expect("valid UTF-8 capture name");
283Some(CaptureRef {
284 cap: match cap.parse::<usize>() {
285Ok(i) => Ref::Number(i),
286Err(_) => Ref::Named(cap),
287 },
288 end: cap_end,
289 })
290}
291292/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
293/// brace has been found at `i-1` in `rep`. This then looks for a closing
294/// brace and returns the capture reference within the brace.
295fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
296match (&b'{', &rep[i.checked_sub(1).unwrap()]) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
297let start = i;
298while rep.get(i).map_or(false, |&b| b != b'}') {
299 i += 1;
300 }
301if !rep.get(i).map_or(false, |&b| b == b'}') {
302return None;
303 }
304// When looking at braced names, we don't put any restrictions on the name,
305 // so it's possible it could be invalid UTF-8. But a capture group name
306 // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
307 // safely return None.
308let cap = match core::str::from_utf8(&rep[start..i]) {
309Err(_) => return None,
310Ok(cap) => cap,
311 };
312Some(CaptureRef {
313 cap: match cap.parse::<usize>() {
314Ok(i) => Ref::Number(i),
315Err(_) => Ref::Named(cap),
316 },
317 end: i + 1,
318 })
319}
/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form.
///
/// The allowed bytes are exactly the ASCII class `[0-9A-Za-z_]`.
fn is_valid_cap_letter(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Generates a test asserting what `find_cap_ref` returns for `$text`:
    // `None` in the two-argument form, `Some($capref)` otherwise.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Builds the expected `CaptureRef` from a name (`&str`) or an index
    // (`usize`) plus the end offset of the reference.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Runs `super::string` over `replacement`, resolving names through
    // `name_to_index` and indices through `caps`.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that the lookup below can use binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    // Same as `interpolate_string`, but exercises `super::bytes` and converts
    // the result back to a `String` for comparison.
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that the lookup below can use binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }

    // Generates a test checking that both `string` and `bytes` produce
    // `$expected` for the given name map, capture values and replacement.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced reference names no known group, so
    // it is replaced with the empty string and the spaces on both sides of
    // it remain, leaving two consecutive spaces in the output.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}