1use core::{char, fmt, iter::FusedIterator, slice};
84
85use alloc::{vec, vec::Vec};
86
/// Size in bytes of the scratch buffers used when encoding a scalar value,
/// and the exclusive upper bound of the per-length loops in this module.
const MAX_UTF8_BYTES: usize = 4;
88
89#[derive(#[automatically_derived]
impl ::core::marker::Copy for Utf8Sequence { }Copy, #[automatically_derived]
impl ::core::clone::Clone for Utf8Sequence {
#[inline]
fn clone(&self) -> Utf8Sequence {
let _: ::core::clone::AssertParamIsClone<Utf8Range>;
let _: ::core::clone::AssertParamIsClone<[Utf8Range; 2]>;
let _: ::core::clone::AssertParamIsClone<[Utf8Range; 3]>;
let _: ::core::clone::AssertParamIsClone<[Utf8Range; 4]>;
*self
}
}Clone, #[automatically_derived]
impl ::core::cmp::Eq for Utf8Sequence {
#[inline]
#[doc(hidden)]
#[coverage(off)]
fn assert_fields_are_eq(&self) {
let _: ::core::cmp::AssertParamIsEq<Utf8Range>;
let _: ::core::cmp::AssertParamIsEq<[Utf8Range; 2]>;
let _: ::core::cmp::AssertParamIsEq<[Utf8Range; 3]>;
let _: ::core::cmp::AssertParamIsEq<[Utf8Range; 4]>;
}
}Eq, #[automatically_derived]
impl ::core::cmp::PartialEq for Utf8Sequence {
#[inline]
fn eq(&self, other: &Utf8Sequence) -> bool {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
__self_discr == __arg1_discr &&
match (self, other) {
(Utf8Sequence::One(__self_0), Utf8Sequence::One(__arg1_0)) =>
__self_0 == __arg1_0,
(Utf8Sequence::Two(__self_0), Utf8Sequence::Two(__arg1_0)) =>
__self_0 == __arg1_0,
(Utf8Sequence::Three(__self_0), Utf8Sequence::Three(__arg1_0))
=> __self_0 == __arg1_0,
(Utf8Sequence::Four(__self_0), Utf8Sequence::Four(__arg1_0))
=> __self_0 == __arg1_0,
_ => unsafe { ::core::intrinsics::unreachable() }
}
}
}PartialEq, #[automatically_derived]
impl ::core::cmp::PartialOrd for Utf8Sequence {
#[inline]
fn partial_cmp(&self, other: &Utf8Sequence)
-> ::core::option::Option<::core::cmp::Ordering> {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
match (self, other) {
(Utf8Sequence::One(__self_0), Utf8Sequence::One(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
(Utf8Sequence::Two(__self_0), Utf8Sequence::Two(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
(Utf8Sequence::Three(__self_0), Utf8Sequence::Three(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
(Utf8Sequence::Four(__self_0), Utf8Sequence::Four(__arg1_0)) =>
::core::cmp::PartialOrd::partial_cmp(__self_0, __arg1_0),
_ =>
::core::cmp::PartialOrd::partial_cmp(&__self_discr,
&__arg1_discr),
}
}
}PartialOrd, #[automatically_derived]
impl ::core::cmp::Ord for Utf8Sequence {
#[inline]
fn cmp(&self, other: &Utf8Sequence) -> ::core::cmp::Ordering {
let __self_discr = ::core::intrinsics::discriminant_value(self);
let __arg1_discr = ::core::intrinsics::discriminant_value(other);
match ::core::cmp::Ord::cmp(&__self_discr, &__arg1_discr) {
::core::cmp::Ordering::Equal =>
match (self, other) {
(Utf8Sequence::One(__self_0), Utf8Sequence::One(__arg1_0))
=> ::core::cmp::Ord::cmp(__self_0, __arg1_0),
(Utf8Sequence::Two(__self_0), Utf8Sequence::Two(__arg1_0))
=> ::core::cmp::Ord::cmp(__self_0, __arg1_0),
(Utf8Sequence::Three(__self_0),
Utf8Sequence::Three(__arg1_0)) =>
::core::cmp::Ord::cmp(__self_0, __arg1_0),
(Utf8Sequence::Four(__self_0), Utf8Sequence::Four(__arg1_0))
=> ::core::cmp::Ord::cmp(__self_0, __arg1_0),
_ => unsafe { ::core::intrinsics::unreachable() }
},
cmp => cmp,
}
}
}Ord)]
97pub enum Utf8Sequence {
98 One(Utf8Range),
100 Two([Utf8Range; 2]),
102 Three([Utf8Range; 3]),
104 Four([Utf8Range; 4]),
106}
107
108impl Utf8Sequence {
109 fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
114 match (&start.len(), &end.len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(start.len(), end.len());
115 match start.len() {
116 2 => Utf8Sequence::Two([
117 Utf8Range::new(start[0], end[0]),
118 Utf8Range::new(start[1], end[1]),
119 ]),
120 3 => Utf8Sequence::Three([
121 Utf8Range::new(start[0], end[0]),
122 Utf8Range::new(start[1], end[1]),
123 Utf8Range::new(start[2], end[2]),
124 ]),
125 4 => Utf8Sequence::Four([
126 Utf8Range::new(start[0], end[0]),
127 Utf8Range::new(start[1], end[1]),
128 Utf8Range::new(start[2], end[2]),
129 Utf8Range::new(start[3], end[3]),
130 ]),
131 n => {
::core::panicking::panic_fmt(format_args!("internal error: entered unreachable code: {0}",
format_args!("invalid encoded length: {0}", n)));
}unreachable!("invalid encoded length: {n}"),
132 }
133 }
134
135 pub fn as_slice(&self) -> &[Utf8Range] {
137 use self::Utf8Sequence::*;
138 match *self {
139 One(ref r) => slice::from_ref(r),
140 Two(ref r) => &r[..],
141 Three(ref r) => &r[..],
142 Four(ref r) => &r[..],
143 }
144 }
145
146 pub fn len(&self) -> usize {
150 self.as_slice().len()
151 }
152
153 pub fn reverse(&mut self) {
170 match *self {
171 Utf8Sequence::One(_) => {}
172 Utf8Sequence::Two(ref mut x) => x.reverse(),
173 Utf8Sequence::Three(ref mut x) => x.reverse(),
174 Utf8Sequence::Four(ref mut x) => x.reverse(),
175 }
176 }
177
178 pub fn matches(&self, bytes: &[u8]) -> bool {
181 if bytes.len() < self.len() {
182 return false;
183 }
184 for (&b, r) in bytes.iter().zip(self) {
185 if !r.matches(b) {
186 return false;
187 }
188 }
189 true
190 }
191}
192
193impl<'a> IntoIterator for &'a Utf8Sequence {
194 type IntoIter = slice::Iter<'a, Utf8Range>;
195 type Item = &'a Utf8Range;
196
197 fn into_iter(self) -> Self::IntoIter {
198 self.as_slice().iter()
199 }
200}
201
202impl fmt::Debug for Utf8Sequence {
203 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204 use self::Utf8Sequence::*;
205 match *self {
206 One(ref r) => f.write_fmt(format_args!("{0:?}", r))write!(f, "{r:?}"),
207 Two(ref r) => f.write_fmt(format_args!("{0:?}{1:?}", r[0], r[1]))write!(f, "{:?}{:?}", r[0], r[1]),
208 Three(ref r) => f.write_fmt(format_args!("{0:?}{1:?}{2:?}", r[0], r[1], r[2]))write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
209 Four(ref r) => {
210 f.write_fmt(format_args!("{0:?}{1:?}{2:?}{3:?}", r[0], r[1], r[2], r[3]))write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
211 }
212 }
213 }
214}
215
/// An inclusive range of bytes.
///
/// Ordering is derived: ranges compare by `start` first, then by `end`.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct Utf8Range {
    /// First byte of the range (inclusive).
    pub start: u8,
    /// Last byte of the range (inclusive).
    pub end: u8,
}

impl Utf8Range {
    /// Creates the range covering `start` through `end`, both inclusive.
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start, end }
    }

    /// Returns true if `b` lies within this range.
    ///
    /// A range whose `start` is greater than its `end` matches nothing.
    pub fn matches(&self, b: u8) -> bool {
        (self.start..=self.end).contains(&b)
    }
}

/// Formats as `[XX]` when the range holds a single byte and as `[XX-YY]`
/// otherwise, using upper-case hexadecimal without padding.
impl fmt::Debug for Utf8Range {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.start == self.end {
            write!(f, "[{:X}]", self.start)
        } else {
            write!(f, "[{:X}-{:X}]", self.start, self.end)
        }
    }
}
245
246#[derive(#[automatically_derived]
impl ::core::fmt::Debug for Utf8Sequences {
#[inline]
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
::core::fmt::Formatter::debug_struct_field1_finish(f, "Utf8Sequences",
"range_stack", &&self.range_stack)
}
}Debug)]
297pub struct Utf8Sequences {
298 range_stack: Vec<ScalarRange>,
299}
300
301impl Utf8Sequences {
302 pub fn new(start: char, end: char) -> Self {
305 let range =
306 ScalarRange { start: u32::from(start), end: u32::from(end) };
307 Utf8Sequences { range_stack: ::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
[range]))vec![range] }
308 }
309
310 #[doc(hidden)]
315 pub fn reset(&mut self, start: char, end: char) {
316 self.range_stack.clear();
317 self.push(u32::from(start), u32::from(end));
318 }
319
320 fn push(&mut self, start: u32, end: u32) {
321 self.range_stack.push(ScalarRange { start, end });
322 }
323}
324
/// An inclusive range of `u32` values, used as the unit of work while
/// producing byte-range sequences.
struct ScalarRange {
    /// First value of the range (inclusive).
    start: u32,
    /// Last value of the range (inclusive).
    end: u32,
}

/// Formats as `ScalarRange(START, END)` with both bounds in upper-case
/// hexadecimal.
impl fmt::Debug for ScalarRange {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
    }
}
335
336impl Iterator for Utf8Sequences {
337 type Item = Utf8Sequence;
338
339 fn next(&mut self) -> Option<Self::Item> {
340 'TOP: while let Some(mut r) = self.range_stack.pop() {
341 'INNER: loop {
342 if let Some((r1, r2)) = r.split() {
343 self.push(r2.start, r2.end);
344 r.start = r1.start;
345 r.end = r1.end;
346 continue 'INNER;
347 }
348 if !r.is_valid() {
349 continue 'TOP;
350 }
351 for i in 1..MAX_UTF8_BYTES {
352 let max = max_scalar_value(i);
353 if r.start <= max && max < r.end {
354 self.push(max + 1, r.end);
355 r.end = max;
356 continue 'INNER;
357 }
358 }
359 if let Some(ascii_range) = r.as_ascii() {
360 return Some(Utf8Sequence::One(ascii_range));
361 }
362 for i in 1..MAX_UTF8_BYTES {
363 let m = (1 << (6 * i)) - 1;
364 if (r.start & !m) != (r.end & !m) {
365 if (r.start & m) != 0 {
366 self.push((r.start | m) + 1, r.end);
367 r.end = r.start | m;
368 continue 'INNER;
369 }
370 if (r.end & m) != m {
371 self.push(r.end & !m, r.end);
372 r.end = (r.end & !m) - 1;
373 continue 'INNER;
374 }
375 }
376 }
377 let mut start = [0; MAX_UTF8_BYTES];
378 let mut end = [0; MAX_UTF8_BYTES];
379 let n = r.encode(&mut start, &mut end);
380 return Some(Utf8Sequence::from_encoded_range(
381 &start[0..n],
382 &end[0..n],
383 ));
384 }
385 }
386 None
387 }
388}
389
390impl FusedIterator for Utf8Sequences {}
391
392impl ScalarRange {
393 fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
397 if self.start < 0xE000 && self.end > 0xD7FF {
398 Some((
399 ScalarRange { start: self.start, end: 0xD7FF },
400 ScalarRange { start: 0xE000, end: self.end },
401 ))
402 } else {
403 None
404 }
405 }
406
407 fn is_valid(&self) -> bool {
409 self.start <= self.end
410 }
411
412 fn as_ascii(&self) -> Option<Utf8Range> {
415 if self.is_ascii() {
416 let start = u8::try_from(self.start).unwrap();
417 let end = u8::try_from(self.end).unwrap();
418 Some(Utf8Range::new(start, end))
419 } else {
420 None
421 }
422 }
423
424 fn is_ascii(&self) -> bool {
427 self.is_valid() && self.end <= 0x7f
428 }
429
430 fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
436 let cs = char::from_u32(self.start).unwrap();
437 let ce = char::from_u32(self.end).unwrap();
438 let ss = cs.encode_utf8(start);
439 let se = ce.encode_utf8(end);
440 match (&ss.len(), &se.len()) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(ss.len(), se.len());
441 ss.len()
442 }
443}
444
/// Returns the largest scalar value whose UTF-8 encoding fits in `nbytes`
/// bytes.
///
/// # Panics
///
/// Panics if `nbytes` is not 1, 2, 3 or 4.
fn max_scalar_value(nbytes: usize) -> u32 {
    match nbytes {
        1 => 0x007F,
        2 => 0x07FF,
        3 => 0xFFFF,
        4 => 0x0010_FFFF,
        _ => unreachable!("invalid UTF-8 byte sequence size"),
    }
}
454
#[cfg(test)]
mod tests {
    use core::char;

    use alloc::{vec, vec::Vec};

    use crate::utf8::{Utf8Range, Utf8Sequences};

    /// Shorthand for building a byte range in expected values.
    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }

    /// Panics if any sequence generated for `start..=end` matches the
    /// three-byte encoding of a surrogate code point.
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        for surrogate in 0xD800..0xE000 {
            let encoded = encode_surrogate(surrogate);
            for seq in Utf8Sequences::new(start, end) {
                if !seq.matches(&encoded) {
                    continue;
                }
                panic!(
                    "Sequence ({:X}, {:X}) contains range {:?}, \
                     which matches surrogate code point {:X} \
                     with encoded bytes {:?}",
                    u32::from(start),
                    u32::from(end),
                    seq,
                    surrogate,
                    encoded,
                );
            }
        }
    }

    #[test]
    fn codepoints_no_surrogates() {
        let cases = [
            ('\u{0}', '\u{FFFF}'),
            ('\u{0}', '\u{10FFFF}'),
            ('\u{0}', '\u{10FFFE}'),
            ('\u{80}', '\u{10FFFF}'),
            ('\u{D7FF}', '\u{E000}'),
        ];
        for (start, end) in cases {
            never_accepts_surrogate_codepoints(start, end);
        }
    }

    #[test]
    fn single_codepoint_one_sequence() {
        // Values that are not valid `char`s are skipped.
        for c in (0x0..=0x0010_FFFF).filter_map(char::from_u32) {
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }

    #[test]
    fn bmp() {
        use crate::utf8::Utf8Sequence::*;

        let expected = vec![
            One(rutf8(0x0, 0x7F)),
            Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
        ];
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
        assert_eq!(seqs, expected);
    }

    #[test]
    fn reverse() {
        use crate::utf8::Utf8Sequence::*;

        let (a, b, c, d) = (
            rutf8(0xA, 0xB),
            rutf8(0xB, 0xC),
            rutf8(0xC, 0xD),
            rutf8(0xD, 0xE),
        );

        let mut one = One(a);
        one.reverse();
        assert_eq!(one.as_slice(), &[a]);

        let mut two = Two([a, b]);
        two.reverse();
        assert_eq!(two.as_slice(), &[b, a]);

        let mut three = Three([a, b, c]);
        three.reverse();
        assert_eq!(three.as_slice(), &[c, b, a]);

        let mut four = Four([a, b, c, d]);
        four.reverse();
        assert_eq!(four.as_slice(), &[d, c, b, a]);
    }

    /// Encodes a surrogate code point with the three-byte UTF-8 bit layout.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;

        assert!(0xD800 <= cp && cp < 0xE000);
        let lead = u8::try_from((cp >> 12) & 0x0F).unwrap() | TAG_THREE_B;
        let middle = u8::try_from((cp >> 6) & 0x3F).unwrap() | TAG_CONT;
        let last = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
        [lead, middle, last]
    }
}