utf8_iter/
lib.rs

1// Copyright Mozilla Foundation
2//
3// Licensed under the Apache License (Version 2.0), or the MIT license,
4// (the "Licenses") at your option. You may not use this file except in
5// compliance with one of the Licenses. You may obtain copies of the
6// Licenses at:
7//
8//    https://www.apache.org/licenses/LICENSE-2.0
9//    https://opensource.org/licenses/MIT
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the Licenses is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the Licenses for the specific language governing permissions and
15// limitations under the Licenses.
16
17#![no_std]
18
19//! Provides iteration by `char` over `&[u8]` containing potentially-invalid
20//! UTF-8 such that errors are handled according to the [WHATWG Encoding
21//! Standard](https://encoding.spec.whatwg.org/#utf-8-decoder) (i.e. the same
22//! way as in `String::from_utf8_lossy`).
23//!
24//! The trait `Utf8CharsEx` provides the convenience method `chars()` on
25//! byte slices themselves instead of having to use the more verbose
26//! `Utf8Chars::new(slice)`.
27//!
28//! ```rust
29//! use utf8_iter::Utf8CharsEx;
30//! let data = b"\xFF\xC2\xE2\xE2\x98\xF0\xF0\x9F\xF0\x9F\x92\xE2\x98\x83";
31//! let from_iter: String = data.chars().collect();
32//! let from_std = String::from_utf8_lossy(data);
33//! assert_eq!(from_iter, from_std);
34//! ```
35
36mod indices;
37mod report;
38
39pub use crate::indices::Utf8CharIndices;
40pub use crate::report::ErrorReportingUtf8Chars;
41pub use crate::report::Utf8CharsError;
42use core::iter::FusedIterator;
43
44#[repr(align(64))] // Align to cache lines
45struct Utf8Data {
46    pub table: [u8; 384],
47}
48
49// This is generated code copied and pasted from utf_8.rs of encoding_rs.
50// Please don't edit by hand but instead regenerate as instructed in that
51// file.
52
53static UTF8_DATA: Utf8Data = Utf8Data {
54    table: [
55        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
56        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
57        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
58        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
59        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
60        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
61        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
62        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
63        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
64        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
65        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
66        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
67        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
68        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
69        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
70        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
71        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
72        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
73        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
74    ],
75};
76
77// End manually copypasted generated code.
78
79#[inline(always)]
80fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
81    i.wrapping_sub(start) <= (end - start)
82}
83
84/// Iterator by `char` over `&[u8]` that contains
85/// potentially-invalid UTF-8. See the crate documentation.
86#[derive(Debug, Clone)]
87pub struct Utf8Chars<'a> {
88    remaining: &'a [u8],
89}
90
91impl<'a> Utf8Chars<'a> {
92    #[inline(always)]
93    /// Creates the iterator from a byte slice.
94    pub fn new(bytes: &'a [u8]) -> Self {
95        Utf8Chars::<'a> { remaining: bytes }
96    }
97
98    /// Views the current remaining data in the iterator as a subslice
99    /// of the original slice.
100    #[inline(always)]
101    pub fn as_slice(&self) -> &'a [u8] {
102        self.remaining
103    }
104
105    #[inline(never)]
106    fn next_fallback(&mut self) -> Option<char> {
107        if self.remaining.is_empty() {
108            return None;
109        }
110        let first = self.remaining[0];
111        if first < 0x80 {
112            self.remaining = &self.remaining[1..];
113            return Some(char::from(first));
114        }
115        if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
116            self.remaining = &self.remaining[1..];
117            return Some('\u{FFFD}');
118        }
119        let second = self.remaining[1];
120        let (lower_bound, upper_bound) = match first {
121            0xE0 => (0xA0, 0xBF),
122            0xED => (0x80, 0x9F),
123            0xF0 => (0x90, 0xBF),
124            0xF4 => (0x80, 0x8F),
125            _ => (0x80, 0xBF),
126        };
127        if !in_inclusive_range8(second, lower_bound, upper_bound) {
128            self.remaining = &self.remaining[1..];
129            return Some('\u{FFFD}');
130        }
131        if first < 0xE0 {
132            self.remaining = &self.remaining[2..];
133            let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
134            return Some(unsafe { char::from_u32_unchecked(point) });
135        }
136        if self.remaining.len() == 2 {
137            self.remaining = &self.remaining[2..];
138            return Some('\u{FFFD}');
139        }
140        let third = self.remaining[2];
141        if !in_inclusive_range8(third, 0x80, 0xBF) {
142            self.remaining = &self.remaining[2..];
143            return Some('\u{FFFD}');
144        }
145        if first < 0xF0 {
146            self.remaining = &self.remaining[3..];
147            let point = ((u32::from(first) & 0xF) << 12)
148                | ((u32::from(second) & 0x3F) << 6)
149                | (u32::from(third) & 0x3F);
150            return Some(unsafe { char::from_u32_unchecked(point) });
151        }
152        // At this point, we have a valid 3-byte prefix of a
153        // four-byte sequence that has to be incomplete, because
154        // otherwise `next()` would have succeeded.
155        self.remaining = &self.remaining[3..];
156        Some('\u{FFFD}')
157    }
158}
159
160impl<'a> Iterator for Utf8Chars<'a> {
161    type Item = char;
162
163    #[inline]
164    fn next(&mut self) -> Option<char> {
165        // Not delegating directly to `ErrorReportingUtf8Chars` to avoid
166        // an extra branch in the common case based on a cursory inspection
167        // of generated code in a similar case. Be sure to inspect the
168        // generated code as inlined into an actual usage site carefully
169        // if attempting to consolidate the source code here.
170
171        // This loop is only broken out of as goto forward
172        #[allow(clippy::never_loop)]
173        loop {
174            if self.remaining.len() < 4 {
175                break;
176            }
177            let first = self.remaining[0];
178            if first < 0x80 {
179                self.remaining = &self.remaining[1..];
180                return Some(char::from(first));
181            }
182            let second = self.remaining[1];
183            if in_inclusive_range8(first, 0xC2, 0xDF) {
184                if !in_inclusive_range8(second, 0x80, 0xBF) {
185                    break;
186                }
187                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
188                self.remaining = &self.remaining[2..];
189                return Some(unsafe { char::from_u32_unchecked(point) });
190            }
191            // This table-based formulation was benchmark-based in encoding_rs,
192            // but it hasn't been re-benchmarked in this iterator context.
193            let third = self.remaining[2];
194            if first < 0xF0 {
195                if ((UTF8_DATA.table[usize::from(second)]
196                    & UTF8_DATA.table[usize::from(first) + 0x80])
197                    | (third >> 6))
198                    != 2
199                {
200                    break;
201                }
202                let point = ((u32::from(first) & 0xF) << 12)
203                    | ((u32::from(second) & 0x3F) << 6)
204                    | (u32::from(third) & 0x3F);
205                self.remaining = &self.remaining[3..];
206                return Some(unsafe { char::from_u32_unchecked(point) });
207            }
208            let fourth = self.remaining[3];
209            if (u16::from(
210                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
211            ) | u16::from(third >> 6)
212                | (u16::from(fourth & 0xC0) << 2))
213                != 0x202
214            {
215                break;
216            }
217            let point = ((u32::from(first) & 0x7) << 18)
218                | ((u32::from(second) & 0x3F) << 12)
219                | ((u32::from(third) & 0x3F) << 6)
220                | (u32::from(fourth) & 0x3F);
221            self.remaining = &self.remaining[4..];
222            return Some(unsafe { char::from_u32_unchecked(point) });
223        }
224        self.next_fallback()
225    }
226}
227
228impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
229    #[inline]
230    fn next_back(&mut self) -> Option<char> {
231        if self.remaining.is_empty() {
232            return None;
233        }
234        let mut attempt = 1;
235        for b in self.remaining.iter().rev() {
236            if b & 0xC0 != 0x80 {
237                let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
238                let mut inner = Utf8Chars::new(tail);
239                let candidate = inner.next();
240                if inner.as_slice().is_empty() {
241                    self.remaining = head;
242                    return candidate;
243                }
244                break;
245            }
246            if attempt == 4 {
247                break;
248            }
249            attempt += 1;
250        }
251
252        self.remaining = &self.remaining[..self.remaining.len() - 1];
253        Some('\u{FFFD}')
254    }
255}
256
257impl FusedIterator for Utf8Chars<'_> {}
258
259/// Convenience trait that adds `chars()` and `char_indices()` methods
260/// similar to the ones on string slices to byte slices.
261pub trait Utf8CharsEx {
262    fn chars(&self) -> Utf8Chars<'_>;
263    fn char_indices(&self) -> Utf8CharIndices<'_>;
264}
265
266impl Utf8CharsEx for [u8] {
267    /// Convenience method for creating an UTF-8 iterator
268    /// for the slice.
269    #[inline]
270    fn chars(&self) -> Utf8Chars<'_> {
271        Utf8Chars::new(self)
272    }
273    /// Convenience method for creating a byte index and
274    /// UTF-8 iterator for the slice.
275    #[inline]
276    fn char_indices(&self) -> Utf8CharIndices<'_> {
277        Utf8CharIndices::new(self)
278    }
279}
280
281// No manually-written tests for forward-iteration, because the code passed multiple
282// days of fuzzing comparing with known-good behavior.