1#![no_std]
18
19mod indices;
37mod report;
38
39pub use crate::indices::Utf8CharIndices;
40pub use crate::report::ErrorReportingUtf8Chars;
41pub use crate::report::Utf8CharsError;
42use core::iter::FusedIterator;
43
/// Wrapper around the UTF-8 validation lookup table.
///
/// The wrapper exists so that an alignment can be requested for the table;
/// presumably 64 was chosen to match a typical cache-line size (TODO confirm).
#[repr(align(64))]
struct Utf8Data {
    /// Byte-classification bit masks; see `UTF8_DATA` for the layout.
    pub table: [u8; 384],
}
48
// Lookup table used by `Utf8Chars::next` to validate the first two bytes of
// three- and four-byte sequences with a single AND.
//
// Layout, as used by the decoder:
// * `table[second]` (indices 0..=255) holds a bit mask for the byte when it
//   appears in the second position.
// * `table[first + 0x80]` (indices 0x100..=0x17F for lead bytes 0x80..=0xFF)
//   holds a bit mask for the byte when it appears in the lead position.
//
// The decoder ANDs the two masks and requires the result to be zero, so a
// set bit in both masks marks a forbidden lead/second combination. Every
// entry is a multiple of four, which leaves the two low bits free for the
// decoder to OR in the top bits of the third byte.
static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
76
/// Reports whether `i` lies in the inclusive range `start..=end`.
///
/// The test is done with a single unsigned comparison: subtracting `start`
/// with wrap-around maps every value below `start` to something larger than
/// the width of the range. Callers are expected to pass `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
83
/// Iterator that decodes a byte slice as UTF-8 and yields `char`s.
///
/// The input does not have to be valid UTF-8: the decoding code in this file
/// yields U+FFFD (the replacement character) for byte sequences it rejects
/// instead of failing.
#[derive(Debug, Clone)]
pub struct Utf8Chars<'a> {
    // Bytes that have not been consumed yet; shrinks from the front on
    // `next` and from the back on `next_back`.
    remaining: &'a [u8],
}
90
91impl<'a> Utf8Chars<'a> {
92 #[inline(always)]
93 pub fn new(bytes: &'a [u8]) -> Self {
95 Utf8Chars::<'a> { remaining: bytes }
96 }
97
98 #[inline(always)]
101 pub fn as_slice(&self) -> &'a [u8] {
102 self.remaining
103 }
104
105 #[inline(never)]
106 fn next_fallback(&mut self) -> Option<char> {
107 if self.remaining.is_empty() {
108 return None;
109 }
110 let first = self.remaining[0];
111 if first < 0x80 {
112 self.remaining = &self.remaining[1..];
113 return Some(char::from(first));
114 }
115 if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
116 self.remaining = &self.remaining[1..];
117 return Some('\u{FFFD}');
118 }
119 let second = self.remaining[1];
120 let (lower_bound, upper_bound) = match first {
121 0xE0 => (0xA0, 0xBF),
122 0xED => (0x80, 0x9F),
123 0xF0 => (0x90, 0xBF),
124 0xF4 => (0x80, 0x8F),
125 _ => (0x80, 0xBF),
126 };
127 if !in_inclusive_range8(second, lower_bound, upper_bound) {
128 self.remaining = &self.remaining[1..];
129 return Some('\u{FFFD}');
130 }
131 if first < 0xE0 {
132 self.remaining = &self.remaining[2..];
133 let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
134 return Some(unsafe { char::from_u32_unchecked(point) });
135 }
136 if self.remaining.len() == 2 {
137 self.remaining = &self.remaining[2..];
138 return Some('\u{FFFD}');
139 }
140 let third = self.remaining[2];
141 if !in_inclusive_range8(third, 0x80, 0xBF) {
142 self.remaining = &self.remaining[2..];
143 return Some('\u{FFFD}');
144 }
145 if first < 0xF0 {
146 self.remaining = &self.remaining[3..];
147 let point = ((u32::from(first) & 0xF) << 12)
148 | ((u32::from(second) & 0x3F) << 6)
149 | (u32::from(third) & 0x3F);
150 return Some(unsafe { char::from_u32_unchecked(point) });
151 }
152 self.remaining = &self.remaining[3..];
156 Some('\u{FFFD}')
157 }
158}
159
160impl<'a> Iterator for Utf8Chars<'a> {
161 type Item = char;
162
163 #[inline]
164 fn next(&mut self) -> Option<char> {
165 #[allow(clippy::never_loop)]
173 loop {
174 if self.remaining.len() < 4 {
175 break;
176 }
177 let first = self.remaining[0];
178 if first < 0x80 {
179 self.remaining = &self.remaining[1..];
180 return Some(char::from(first));
181 }
182 let second = self.remaining[1];
183 if in_inclusive_range8(first, 0xC2, 0xDF) {
184 if !in_inclusive_range8(second, 0x80, 0xBF) {
185 break;
186 }
187 let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
188 self.remaining = &self.remaining[2..];
189 return Some(unsafe { char::from_u32_unchecked(point) });
190 }
191 let third = self.remaining[2];
194 if first < 0xF0 {
195 if ((UTF8_DATA.table[usize::from(second)]
196 & UTF8_DATA.table[usize::from(first) + 0x80])
197 | (third >> 6))
198 != 2
199 {
200 break;
201 }
202 let point = ((u32::from(first) & 0xF) << 12)
203 | ((u32::from(second) & 0x3F) << 6)
204 | (u32::from(third) & 0x3F);
205 self.remaining = &self.remaining[3..];
206 return Some(unsafe { char::from_u32_unchecked(point) });
207 }
208 let fourth = self.remaining[3];
209 if (u16::from(
210 UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
211 ) | u16::from(third >> 6)
212 | (u16::from(fourth & 0xC0) << 2))
213 != 0x202
214 {
215 break;
216 }
217 let point = ((u32::from(first) & 0x7) << 18)
218 | ((u32::from(second) & 0x3F) << 12)
219 | ((u32::from(third) & 0x3F) << 6)
220 | (u32::from(fourth) & 0x3F);
221 self.remaining = &self.remaining[4..];
222 return Some(unsafe { char::from_u32_unchecked(point) });
223 }
224 self.next_fallback()
225 }
226}
227
228impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
229 #[inline]
230 fn next_back(&mut self) -> Option<char> {
231 if self.remaining.is_empty() {
232 return None;
233 }
234 let mut attempt = 1;
235 for b in self.remaining.iter().rev() {
236 if b & 0xC0 != 0x80 {
237 let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
238 let mut inner = Utf8Chars::new(tail);
239 let candidate = inner.next();
240 if inner.as_slice().is_empty() {
241 self.remaining = head;
242 return candidate;
243 }
244 break;
245 }
246 if attempt == 4 {
247 break;
248 }
249 attempt += 1;
250 }
251
252 self.remaining = &self.remaining[..self.remaining.len() - 1];
253 Some('\u{FFFD}')
254 }
255}
256
// Once `remaining` is empty, both `next` and `next_back` return `None` and
// leave it empty, so the iterator keeps returning `None` after exhaustion.
impl FusedIterator for Utf8Chars<'_> {}
258
/// Extension trait that adds `char`-oriented iteration methods to a type;
/// this file implements it for `[u8]`.
pub trait Utf8CharsEx {
    /// Returns a [`Utf8Chars`] iterator over `self`.
    fn chars(&self) -> Utf8Chars<'_>;
    /// Returns a [`Utf8CharIndices`] iterator over `self`.
    ///
    /// NOTE(review): presumably yields each `char` with its byte offset, by
    /// analogy with `str::char_indices` — confirm against the `indices` module.
    fn char_indices(&self) -> Utf8CharIndices<'_>;
}
265
/// Makes `chars()` and `char_indices()` callable directly on byte slices.
impl Utf8CharsEx for [u8] {
    /// Wraps the slice in a [`Utf8Chars`] iterator positioned at its start.
    #[inline]
    fn chars(&self) -> Utf8Chars<'_> {
        Utf8Chars::new(self)
    }
    /// Wraps the slice in a [`Utf8CharIndices`] iterator.
    #[inline]
    fn char_indices(&self) -> Utf8CharIndices<'_> {
        Utf8CharIndices::new(self)
    }
}
280
281