utf8_iter/
report.rs
1use crate::in_inclusive_range8;
18use crate::UTF8_DATA;
19use core::fmt::Formatter;
20use core::iter::FusedIterator;
21
/// Error value reported when a byte sequence is not well-formed UTF-8.
///
/// The type carries no payload. `Clone` and `Eq` are derived in addition to
/// `Debug` and `PartialEq` so that `Result<char, Utf8CharsError>` items can
/// be cloned and used wherever full equivalence is required.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Utf8CharsError;
31
32impl core::fmt::Display for Utf8CharsError {
33 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), core::fmt::Error> {
34 write!(f, "byte sequence not well-formed UTF-8")
35 }
36}
37
/// Iterator over a byte slice that yields `Result<char, Utf8CharsError>`.
///
/// The only state kept is the part of the input that has not been consumed
/// yet; cloning the iterator copies that slice reference.
#[derive(Clone, Debug)]
pub struct ErrorReportingUtf8Chars<'a> {
    /// Bytes not yet consumed from either end of the input.
    remaining: &'a [u8],
}
52
53impl<'a> ErrorReportingUtf8Chars<'a> {
54 #[inline(always)]
55 pub fn new(bytes: &'a [u8]) -> Self {
57 ErrorReportingUtf8Chars::<'a> { remaining: bytes }
58 }
59
60 #[inline(always)]
63 pub fn as_slice(&self) -> &'a [u8] {
64 self.remaining
65 }
66
67 #[inline(never)]
68 fn next_fallback(&mut self) -> Option<Result<char, Utf8CharsError>> {
69 if self.remaining.is_empty() {
70 return None;
71 }
72 let first = self.remaining[0];
73 if first < 0x80 {
74 self.remaining = &self.remaining[1..];
75 return Some(Ok(char::from(first)));
76 }
77 if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
78 self.remaining = &self.remaining[1..];
79 return Some(Err(Utf8CharsError));
80 }
81 let second = self.remaining[1];
82 let (lower_bound, upper_bound) = match first {
83 0xE0 => (0xA0, 0xBF),
84 0xED => (0x80, 0x9F),
85 0xF0 => (0x90, 0xBF),
86 0xF4 => (0x80, 0x8F),
87 _ => (0x80, 0xBF),
88 };
89 if !in_inclusive_range8(second, lower_bound, upper_bound) {
90 self.remaining = &self.remaining[1..];
91 return Some(Err(Utf8CharsError));
92 }
93 if first < 0xE0 {
94 self.remaining = &self.remaining[2..];
95 let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
96 return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
97 }
98 if self.remaining.len() == 2 {
99 self.remaining = &self.remaining[2..];
100 return Some(Err(Utf8CharsError));
101 }
102 let third = self.remaining[2];
103 if !in_inclusive_range8(third, 0x80, 0xBF) {
104 self.remaining = &self.remaining[2..];
105 return Some(Err(Utf8CharsError));
106 }
107 if first < 0xF0 {
108 self.remaining = &self.remaining[3..];
109 let point = ((u32::from(first) & 0xF) << 12)
110 | ((u32::from(second) & 0x3F) << 6)
111 | (u32::from(third) & 0x3F);
112 return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
113 }
114 self.remaining = &self.remaining[3..];
118 Some(Err(Utf8CharsError))
119 }
120}
121
122impl<'a> Iterator for ErrorReportingUtf8Chars<'a> {
123 type Item = Result<char, Utf8CharsError>;
124
125 #[inline]
126 fn next(&mut self) -> Option<Result<char, Utf8CharsError>> {
127 #[allow(clippy::never_loop)]
129 loop {
130 if self.remaining.len() < 4 {
131 break;
132 }
133 let first = self.remaining[0];
134 if first < 0x80 {
135 self.remaining = &self.remaining[1..];
136 return Some(Ok(char::from(first)));
137 }
138 let second = self.remaining[1];
139 if in_inclusive_range8(first, 0xC2, 0xDF) {
140 if !in_inclusive_range8(second, 0x80, 0xBF) {
141 break;
142 }
143 let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
144 self.remaining = &self.remaining[2..];
145 return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
146 }
147 let third = self.remaining[2];
150 if first < 0xF0 {
151 if ((UTF8_DATA.table[usize::from(second)]
152 & UTF8_DATA.table[usize::from(first) + 0x80])
153 | (third >> 6))
154 != 2
155 {
156 break;
157 }
158 let point = ((u32::from(first) & 0xF) << 12)
159 | ((u32::from(second) & 0x3F) << 6)
160 | (u32::from(third) & 0x3F);
161 self.remaining = &self.remaining[3..];
162 return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
163 }
164 let fourth = self.remaining[3];
165 if (u16::from(
166 UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
167 ) | u16::from(third >> 6)
168 | (u16::from(fourth & 0xC0) << 2))
169 != 0x202
170 {
171 break;
172 }
173 let point = ((u32::from(first) & 0x7) << 18)
174 | ((u32::from(second) & 0x3F) << 12)
175 | ((u32::from(third) & 0x3F) << 6)
176 | (u32::from(fourth) & 0x3F);
177 self.remaining = &self.remaining[4..];
178 return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
179 }
180 self.next_fallback()
181 }
182}
183
184impl<'a> DoubleEndedIterator for ErrorReportingUtf8Chars<'a> {
185 #[inline]
186 fn next_back(&mut self) -> Option<Result<char, Utf8CharsError>> {
187 if self.remaining.is_empty() {
188 return None;
189 }
190 let mut attempt = 1;
191 for b in self.remaining.iter().rev() {
192 if b & 0xC0 != 0x80 {
193 let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
194 let mut inner = ErrorReportingUtf8Chars::new(tail);
195 let candidate = inner.next();
196 if inner.as_slice().is_empty() {
197 self.remaining = head;
198 return candidate;
199 }
200 break;
201 }
202 if attempt == 4 {
203 break;
204 }
205 attempt += 1;
206 }
207
208 self.remaining = &self.remaining[..self.remaining.len() - 1];
209 Some(Err(Utf8CharsError))
210 }
211}
212
213impl FusedIterator for ErrorReportingUtf8Chars<'_> {}
214
#[cfg(test)]
mod tests {
    use crate::ErrorReportingUtf8Chars;

    /// Item type produced by the iterator under test.
    type Item<'a> = <ErrorReportingUtf8Chars<'a> as Iterator>::Item;

    /// `Option<Item>` must take no more space than `Option<char>`.
    #[test]
    fn test_size() {
        let wrapped_item = core::mem::size_of::<Option<Item<'_>>>();
        let wrapped_char = core::mem::size_of::<Option<char>>();
        assert_eq!(wrapped_item, wrapped_char);
    }

    /// Two equal `Ok` items must compare equal.
    #[test]
    fn test_eq() {
        let left: Item<'_> = Ok('a');
        let right: Item<'_> = Ok('a');
        assert_eq!(left, right);
    }
}