utf16_iter/
report.rs

1// Copyright Mozilla Foundation
2//
3// Licensed under the Apache License (Version 2.0), or the MIT license,
4// (the "Licenses") at your option. You may not use this file except in
5// compliance with one of the Licenses. You may obtain copies of the
6// Licenses at:
7//
8//    https://www.apache.org/licenses/LICENSE-2.0
9//    https://opensource.org/licenses/MIT
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the Licenses is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the Licenses for the specific language governing permissions and
15// limitations under the Licenses.
16
17use crate::in_inclusive_range16;
18use core::fmt::Formatter;
19use core::iter::FusedIterator;
20
/// A type for signaling UTF-16 errors.
///
/// The value of the unpaired surrogate is not exposed in order
/// to keep the `Result` type (and `Option`-wrapping thereof)
/// the same size as `char`. See an [issue about the representation][1].
///
/// The type carries no data, so it is cheap to copy and compare; it
/// implements `Clone`, `Copy`, `PartialEq` and `Eq` in addition to `Debug`.
///
/// Note: `core::error::Error` is not implemented due to implementing it
/// being an [unstable feature][2] at the time of writing.
///
/// [1]: https://github.com/rust-lang/rust/issues/118367
/// [2]: https://github.com/rust-lang/rust/issues/103765
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub struct Utf16CharsError;
35
36impl core::fmt::Display for Utf16CharsError {
37    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), core::fmt::Error> {
38        write!(f, "unpaired surrogate")
39    }
40}
41
/// Iterator over a `&[u16]` holding potentially-invalid UTF-16 that
/// yields `Result<char, Utf16CharsError>` items. Every unpaired surrogate
/// produces exactly one `Utf16CharsError`.
#[derive(Clone, Debug)]
pub struct ErrorReportingUtf16Chars<'a> {
    /// Code units not yet consumed from either end of the input.
    remaining: &'a [u16],
}
49
50impl<'a> ErrorReportingUtf16Chars<'a> {
51    #[inline(always)]
52    /// Creates the iterator from a `u16` slice.
53    pub fn new(code_units: &'a [u16]) -> Self {
54        ErrorReportingUtf16Chars::<'a> {
55            remaining: code_units,
56        }
57    }
58
59    /// Views the current remaining data in the iterator as a subslice
60    /// of the original slice.
61    #[inline(always)]
62    pub fn as_slice(&self) -> &'a [u16] {
63        self.remaining
64    }
65
66    #[inline(never)]
67    fn surrogate_next(&mut self, surrogate_base: u16, first: u16) -> Result<char, Utf16CharsError> {
68        if surrogate_base <= (0xDBFF - 0xD800) {
69            if let Some((&low, tail_tail)) = self.remaining.split_first() {
70                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
71                    self.remaining = tail_tail;
72                    return Ok(unsafe {
73                        char::from_u32_unchecked(
74                            (u32::from(first) << 10) + u32::from(low)
75                                - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
76                        )
77                    });
78                }
79            }
80        }
81        Err(Utf16CharsError)
82    }
83
84    #[inline(never)]
85    fn surrogate_next_back(&mut self, last: u16) -> Result<char, Utf16CharsError> {
86        if in_inclusive_range16(last, 0xDC00, 0xDFFF) {
87            if let Some((&high, head_head)) = self.remaining.split_last() {
88                if in_inclusive_range16(high, 0xD800, 0xDBFF) {
89                    self.remaining = head_head;
90                    return Ok(unsafe {
91                        char::from_u32_unchecked(
92                            (u32::from(high) << 10) + u32::from(last)
93                                - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
94                        )
95                    });
96                }
97            }
98        }
99        Err(Utf16CharsError)
100    }
101}
102
103impl<'a> Iterator for ErrorReportingUtf16Chars<'a> {
104    type Item = Result<char, Utf16CharsError>;
105
106    #[inline(always)]
107    fn next(&mut self) -> Option<Result<char, Utf16CharsError>> {
108        // Not delegating directly to `ErrorReportingUtf16Chars` to avoid
109        // an extra branch in the common case based on an inspection of
110        // generated code. Be sure to inspect the generated code as inlined
111        // into an actual usage site carefully if attempting to consolidate
112        // the source code here.
113        let (&first, tail) = self.remaining.split_first()?;
114        self.remaining = tail;
115        let surrogate_base = first.wrapping_sub(0xD800);
116        if surrogate_base > (0xDFFF - 0xD800) {
117            return Some(Ok(unsafe { char::from_u32_unchecked(u32::from(first)) }));
118        }
119        Some(self.surrogate_next(surrogate_base, first))
120    }
121}
122
123impl<'a> DoubleEndedIterator for ErrorReportingUtf16Chars<'a> {
124    #[inline(always)]
125    fn next_back(&mut self) -> Option<Result<char, Utf16CharsError>> {
126        let (&last, head) = self.remaining.split_last()?;
127        self.remaining = head;
128        if !in_inclusive_range16(last, 0xD800, 0xDFFF) {
129            return Some(Ok(unsafe { char::from_u32_unchecked(u32::from(last)) }));
130        }
131        Some(self.surrogate_next_back(last))
132    }
133}
134
135impl FusedIterator for ErrorReportingUtf16Chars<'_> {}
136
#[cfg(test)]
mod tests {
    use crate::ErrorReportingUtf16Chars;
    use crate::Utf16CharsEx;

    /// Decodes `units` with the error-reporting iterator, substituting
    /// U+FFFD for every reported error.
    fn lossy(units: &[u16]) -> impl DoubleEndedIterator<Item = char> + '_ {
        ErrorReportingUtf16Chars::new(units).map(|item| item.unwrap_or('\u{FFFD}'))
    }

    #[test]
    fn test_boundaries() {
        // The code units adjacent to the surrogate range decode as themselves.
        assert!(lossy(&[0xD7FFu16]).eq(core::iter::once('\u{D7FF}')));
        assert!(lossy(&[0xE000u16]).eq(core::iter::once('\u{E000}')));
        // The first and the last surrogate, each on its own, are errors.
        assert!(lossy(&[0xD800u16]).eq(core::iter::once('\u{FFFD}')));
        assert!(lossy(&[0xDFFFu16]).eq(core::iter::once('\u{FFFD}')));
    }

    #[test]
    fn test_unpaired() {
        let expected = [0xFFFDu16, 0x0061u16];
        for &lone in [0xD800u16, 0xDFFFu16].iter() {
            assert!(lossy(&[lone, 0x0061u16]).eq(expected.as_slice().chars()));
        }
    }

    #[test]
    fn test_unpaired_rev() {
        let expected = [0xFFFDu16, 0x0061u16];
        for &lone in [0xD800u16, 0xDFFFu16].iter() {
            assert!(lossy(&[lone, 0x0061u16])
                .rev()
                .eq(expected.as_slice().chars().rev()));
        }
    }

    #[test]
    fn test_paired() {
        assert!(lossy(&[0xD83Eu16, 0xDD73u16]).eq(core::iter::once('🥳')));
    }

    #[test]
    fn test_paired_rev() {
        assert!(lossy(&[0xD83Eu16, 0xDD73u16])
            .rev()
            .eq(core::iter::once('🥳')));
    }

    #[test]
    fn test_as_slice() {
        let units = [0x0061u16, 0x0062u16];
        let mut iter = ErrorReportingUtf16Chars::new(units.as_slice());

        // Snapshot the remaining slice before, between and after the items.
        let at_start = iter.as_slice();
        assert_eq!(iter.next(), Some(Ok('a')));
        let in_middle = iter.as_slice();
        assert_eq!(iter.next(), Some(Ok('b')));
        let at_end = iter.as_slice();

        assert_eq!(at_start, &units[..]);
        assert_eq!(in_middle, &units[1..]);
        assert!(at_end.is_empty());
    }

    // Should be a static assert, but not taking a dependency for this.
    #[test]
    fn test_size() {
        type Item = <ErrorReportingUtf16Chars<'static> as Iterator>::Item;
        assert_eq!(
            core::mem::size_of::<Option<Item>>(),
            core::mem::size_of::<Option<char>>()
        );
    }
}