utf8_iter/
report.rs

1// Copyright Mozilla Foundation
2//
3// Licensed under the Apache License (Version 2.0), or the MIT license,
4// (the "Licenses") at your option. You may not use this file except in
5// compliance with one of the Licenses. You may obtain copies of the
6// Licenses at:
7//
8//    https://www.apache.org/licenses/LICENSE-2.0
9//    https://opensource.org/licenses/MIT
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the Licenses is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the Licenses for the specific language governing permissions and
15// limitations under the Licenses.
16
17use crate::in_inclusive_range8;
18use crate::UTF8_DATA;
19use core::fmt::Formatter;
20use core::iter::FusedIterator;
21
/// A type for signaling UTF-8 errors.
///
/// The type carries no data: it only states that the bytes at the current
/// position were not well-formed UTF-8. `Clone` and `Eq` are derived so that
/// `Result<char, Utf8CharsError>` items can be duplicated and compared, which
/// keeps the item type as flexible as the (cloneable) iterator that yields it.
///
/// Note: `core::error::Error` is not implemented due to implementing it
/// being an [unstable feature][1] at the time of writing.
///
/// [1]: https://github.com/rust-lang/rust/issues/103765
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct Utf8CharsError;
31
32impl core::fmt::Display for Utf8CharsError {
33    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), core::fmt::Error> {
34        write!(f, "byte sequence not well-formed UTF-8")
35    }
36}
37
/// Iterator by `Result<char,Utf8CharsError>` over `&[u8]` that contains
/// potentially-invalid UTF-8. There is exactly one `Utf8CharsError` for
/// each error as defined by the WHATWG Encoding Standard.
///
/// Well-formed sequences are yielded as `Ok(char)`. Replacing every `Err`
/// with U+FFFD gives the same sequence as `Utf8Chars` produces for the same
/// input, which is what the example below checks.
///
/// ```
/// let s = b"a\xFFb\xFF\x80c\xF0\x9F\xA4\xA6\xF0\x9F\xA4\xF0\x9F\xF0d";
/// let plain = utf8_iter::Utf8Chars::new(s);
/// let reporting = utf8_iter::ErrorReportingUtf8Chars::new(s);
/// assert!(plain.eq(reporting.map(|r| r.unwrap_or('\u{FFFD}'))));
/// ```
#[derive(Debug, Clone)]
pub struct ErrorReportingUtf8Chars<'a> {
    /// The part of the input that has not been yielded yet. Forward
    /// iteration trims it from the front and backward iteration trims it
    /// from the back.
    remaining: &'a [u8],
}
52
53impl<'a> ErrorReportingUtf8Chars<'a> {
54    #[inline(always)]
55    /// Creates the iterator from a byte slice.
56    pub fn new(bytes: &'a [u8]) -> Self {
57        ErrorReportingUtf8Chars::<'a> { remaining: bytes }
58    }
59
60    /// Views the current remaining data in the iterator as a subslice
61    /// of the original slice.
62    #[inline(always)]
63    pub fn as_slice(&self) -> &'a [u8] {
64        self.remaining
65    }
66
67    #[inline(never)]
68    fn next_fallback(&mut self) -> Option<Result<char, Utf8CharsError>> {
69        if self.remaining.is_empty() {
70            return None;
71        }
72        let first = self.remaining[0];
73        if first < 0x80 {
74            self.remaining = &self.remaining[1..];
75            return Some(Ok(char::from(first)));
76        }
77        if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
78            self.remaining = &self.remaining[1..];
79            return Some(Err(Utf8CharsError));
80        }
81        let second = self.remaining[1];
82        let (lower_bound, upper_bound) = match first {
83            0xE0 => (0xA0, 0xBF),
84            0xED => (0x80, 0x9F),
85            0xF0 => (0x90, 0xBF),
86            0xF4 => (0x80, 0x8F),
87            _ => (0x80, 0xBF),
88        };
89        if !in_inclusive_range8(second, lower_bound, upper_bound) {
90            self.remaining = &self.remaining[1..];
91            return Some(Err(Utf8CharsError));
92        }
93        if first < 0xE0 {
94            self.remaining = &self.remaining[2..];
95            let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
96            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
97        }
98        if self.remaining.len() == 2 {
99            self.remaining = &self.remaining[2..];
100            return Some(Err(Utf8CharsError));
101        }
102        let third = self.remaining[2];
103        if !in_inclusive_range8(third, 0x80, 0xBF) {
104            self.remaining = &self.remaining[2..];
105            return Some(Err(Utf8CharsError));
106        }
107        if first < 0xF0 {
108            self.remaining = &self.remaining[3..];
109            let point = ((u32::from(first) & 0xF) << 12)
110                | ((u32::from(second) & 0x3F) << 6)
111                | (u32::from(third) & 0x3F);
112            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
113        }
114        // At this point, we have a valid 3-byte prefix of a
115        // four-byte sequence that has to be incomplete, because
116        // otherwise `next()` would have succeeded.
117        self.remaining = &self.remaining[3..];
118        Some(Err(Utf8CharsError))
119    }
120}
121
impl<'a> Iterator for ErrorReportingUtf8Chars<'a> {
    type Item = Result<char, Utf8CharsError>;

    /// Yields the next scalar value, or one `Utf8CharsError` for the next
    /// ill-formed sequence, from the front of the remaining bytes.
    ///
    /// The loop body is a fast path that only runs when at least four bytes
    /// remain and only ever returns `Ok`. Whenever it cannot prove the
    /// sequence at the front to be well-formed, it breaks out without
    /// consuming anything and `next_fallback()` re-examines the same bytes.
    #[inline]
    fn next(&mut self) -> Option<Result<char, Utf8CharsError>> {
        // This loop is only broken out of as goto forward
        #[allow(clippy::never_loop)]
        loop {
            // With four or more bytes available, indices 0..=3 below are all
            // in bounds, so no further length checks are needed.
            if self.remaining.len() < 4 {
                break;
            }
            let first = self.remaining[0];
            if first < 0x80 {
                // ASCII maps directly to the same scalar value.
                self.remaining = &self.remaining[1..];
                return Some(Ok(char::from(first)));
            }
            let second = self.remaining[1];
            if in_inclusive_range8(first, 0xC2, 0xDF) {
                // Two-byte sequence: any continuation byte is acceptable.
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break;
                }
                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
                self.remaining = &self.remaining[2..];
                // SAFETY: assuming `in_inclusive_range8` tests
                // `lo <= x <= hi`, `first` is in 0xC2..=0xDF and `second` in
                // 0x80..=0xBF, so `point` is in 0x80..=0x7FF.
                return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
            }
            // This table-based formulation was benchmark-based in encoding_rs,
            // but it hasn't been re-benchmarked in this iterator context.
            let third = self.remaining[2];
            if first < 0xF0 {
                // `third >> 6` equals 2 exactly when `third` is a
                // continuation byte (0x80..=0xBF).
                // NOTE(review): the table term is presumably zero exactly
                // when `second` is an acceptable second byte for `first`
                // (and nonzero for bytes that are not valid leads);
                // `UTF8_DATA` is defined elsewhere, so confirm there.
                if ((UTF8_DATA.table[usize::from(second)]
                    & UTF8_DATA.table[usize::from(first) + 0x80])
                    | (third >> 6))
                    != 2
                {
                    break;
                }
                let point = ((u32::from(first) & 0xF) << 12)
                    | ((u32::from(second) & 0x3F) << 6)
                    | (u32::from(third) & 0x3F);
                self.remaining = &self.remaining[3..];
                // SAFETY (review: depends on the table check above): the
                // check is relied upon to have rejected overlong forms and
                // surrogates, leaving a valid three-byte scalar value.
                return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
            }
            let fourth = self.remaining[3];
            // Same test widened to 16 bits: the low byte must again be 2
            // (table term clear, `third` a continuation byte) and
            // `(fourth & 0xC0) << 2` contributes 0x200 exactly when `fourth`
            // is a continuation byte, giving 0x202 in total.
            if (u16::from(
                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
            ) | u16::from(third >> 6)
                | (u16::from(fourth & 0xC0) << 2))
                != 0x202
            {
                break;
            }
            let point = ((u32::from(first) & 0x7) << 18)
                | ((u32::from(second) & 0x3F) << 12)
                | ((u32::from(third) & 0x3F) << 6)
                | (u32::from(fourth) & 0x3F);
            self.remaining = &self.remaining[4..];
            // SAFETY (review: depends on the table check above): the check
            // is relied upon to have limited the sequence to
            // U+10000..=U+10FFFF.
            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
        }
        // Short input, end of input, or something ill-formed at the front.
        self.next_fallback()
    }
}
183
184impl<'a> DoubleEndedIterator for ErrorReportingUtf8Chars<'a> {
185    #[inline]
186    fn next_back(&mut self) -> Option<Result<char, Utf8CharsError>> {
187        if self.remaining.is_empty() {
188            return None;
189        }
190        let mut attempt = 1;
191        for b in self.remaining.iter().rev() {
192            if b & 0xC0 != 0x80 {
193                let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
194                let mut inner = ErrorReportingUtf8Chars::new(tail);
195                let candidate = inner.next();
196                if inner.as_slice().is_empty() {
197                    self.remaining = head;
198                    return candidate;
199                }
200                break;
201            }
202            if attempt == 4 {
203                break;
204            }
205            attempt += 1;
206        }
207
208        self.remaining = &self.remaining[..self.remaining.len() - 1];
209        Some(Err(Utf8CharsError))
210    }
211}
212
// Once `remaining` is empty, both `next()` and `next_back()` return `None`,
// and nothing ever makes the slice grow again, so the iterator stays
// exhausted as `FusedIterator` requires.
impl FusedIterator for ErrorReportingUtf8Chars<'_> {}
214
#[cfg(test)]
mod tests {
    use crate::ErrorReportingUtf8Chars;

    /// The item type yielded by the iterator under test.
    type Item = <ErrorReportingUtf8Chars<'static> as Iterator>::Item;

    // Should be a static assert, but not taking a dependency for this.
    /// The error variant must not make the item any larger than a plain
    /// optional `char`.
    #[test]
    fn test_size() {
        let item_size = core::mem::size_of::<Option<Item>>();
        let char_size = core::mem::size_of::<Option<char>>();
        assert_eq!(item_size, char_size);
    }

    /// Items have to be comparable with `assert_eq!`.
    #[test]
    fn test_eq() {
        let a: Item = Ok('a');
        let a_again: Item = Ok('a');
        assert_eq!(a, a_again);
    }
}