utf16_iter/
lib.rs

1// Copyright Mozilla Foundation
2//
3// Licensed under the Apache License (Version 2.0), or the MIT license,
4// (the "Licenses") at your option. You may not use this file except in
5// compliance with one of the Licenses. You may obtain copies of the
6// Licenses at:
7//
8//    https://www.apache.org/licenses/LICENSE-2.0
9//    https://opensource.org/licenses/MIT
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the Licenses is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the Licenses for the specific language governing permissions and
15// limitations under the Licenses.
16
17#![no_std]
18
19//! Provides iteration by `char` over `&[u16]` containing potentially-invalid
20//! UTF-16 such that errors are replaced with the REPLACEMENT CHARACTER.
21//!
//! The trait `Utf16CharsEx` provides the convenience methods `chars()` and
//! `char_indices()` on `u16` slices themselves instead of having to use the
//! more verbose `Utf16Chars::new(slice)` and `Utf16CharIndices::new(slice)`.
25
26mod indices;
27mod report;
28
29pub use crate::indices::Utf16CharIndices;
30pub use crate::report::ErrorReportingUtf16Chars;
31pub use crate::report::Utf16CharsError;
32use core::iter::FusedIterator;
33
/// Reports whether `i` lies within the inclusive range `start..=end`.
///
/// Only one comparison is needed: subtracting `start` with wrap-around
/// turns every value below `start` into a large number, so any `i`
/// outside the range ends up greater than the span `end - start`.
///
/// Callers are expected to pass `start <= end`; otherwise computing
/// `end - start` overflows.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let span = end - start;
    let offset = i.wrapping_sub(start);
    offset <= span
}
38
/// Iterator by `char` over `&[u16]` that contains
/// potentially-invalid UTF-16. See the crate documentation.
///
/// The iterator stores only the part of the input that has not been
/// consumed yet; decoding from either end shrinks that slice.
#[derive(Debug, Clone)]
pub struct Utf16Chars<'a> {
    /// Code units that have not been consumed yet.
    remaining: &'a [u16],
}
45
46impl<'a> Utf16Chars<'a> {
47    #[inline(always)]
48    /// Creates the iterator from a `u16` slice.
49    pub fn new(code_units: &'a [u16]) -> Self {
50        Utf16Chars::<'a> {
51            remaining: code_units,
52        }
53    }
54
55    /// Views the current remaining data in the iterator as a subslice
56    /// of the original slice.
57    #[inline(always)]
58    pub fn as_slice(&self) -> &'a [u16] {
59        self.remaining
60    }
61
62    #[inline(never)]
63    fn surrogate_next(&mut self, surrogate_base: u16, first: u16) -> char {
64        if surrogate_base <= (0xDBFF - 0xD800) {
65            if let Some((&low, tail_tail)) = self.remaining.split_first() {
66                if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
67                    self.remaining = tail_tail;
68                    return unsafe {
69                        char::from_u32_unchecked(
70                            (u32::from(first) << 10) + u32::from(low)
71                                - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
72                        )
73                    };
74                }
75            }
76        }
77        '\u{FFFD}'
78    }
79
80    #[inline(never)]
81    fn surrogate_next_back(&mut self, last: u16) -> char {
82        if in_inclusive_range16(last, 0xDC00, 0xDFFF) {
83            if let Some((&high, head_head)) = self.remaining.split_last() {
84                if in_inclusive_range16(high, 0xD800, 0xDBFF) {
85                    self.remaining = head_head;
86                    return unsafe {
87                        char::from_u32_unchecked(
88                            (u32::from(high) << 10) + u32::from(last)
89                                - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
90                        )
91                    };
92                }
93            }
94        }
95        '\u{FFFD}'
96    }
97}
98
99impl<'a> Iterator for Utf16Chars<'a> {
100    type Item = char;
101
102    #[inline(always)]
103    fn next(&mut self) -> Option<char> {
104        // It might be OK to delegate to `ErrorReportingUtf16Chars`, but since
105        // the methods are rather small, copypaste is probably clearer. Also,
106        // copypaste would _not_ be equivalent if any part of this was delegated
107        // to an `inline(never)` helper. However, previous experimentation indicated
108        // that such a helper didn't help performance here.
109        let (&first, tail) = self.remaining.split_first()?;
110        self.remaining = tail;
111        let surrogate_base = first.wrapping_sub(0xD800);
112        if surrogate_base > (0xDFFF - 0xD800) {
113            return Some(unsafe { char::from_u32_unchecked(u32::from(first)) });
114        }
115        Some(self.surrogate_next(surrogate_base, first))
116    }
117}
118
119impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
120    #[inline(always)]
121    fn next_back(&mut self) -> Option<char> {
122        let (&last, head) = self.remaining.split_last()?;
123        self.remaining = head;
124        if !in_inclusive_range16(last, 0xD800, 0xDFFF) {
125            return Some(unsafe { char::from_u32_unchecked(u32::from(last)) });
126        }
127        Some(self.surrogate_next_back(last))
128    }
129}
130
// Once `remaining` is empty, `next` and `next_back` return `None` without
// changing any state, so the iterator keeps returning `None` afterwards.
impl FusedIterator for Utf16Chars<'_> {}
132
/// Convenience trait that adds `chars()` and `char_indices()` methods
/// similar to the ones on string slices to `u16` slices.
pub trait Utf16CharsEx {
    /// Returns a [`Utf16Chars`] iterator over `self`.
    fn chars(&self) -> Utf16Chars<'_>;
    /// Returns a [`Utf16CharIndices`] iterator over `self`.
    fn char_indices(&self) -> Utf16CharIndices<'_>;
}
139
impl Utf16CharsEx for [u16] {
    /// Convenience method for creating a UTF-16 iterator
    /// for the slice.
    ///
    /// Equivalent to `Utf16Chars::new(self)`.
    #[inline]
    fn chars(&self) -> Utf16Chars<'_> {
        Utf16Chars::new(self)
    }
    /// Convenience method for creating a code unit index and
    /// UTF-16 iterator for the slice.
    ///
    /// Equivalent to `Utf16CharIndices::new(self)`.
    #[inline]
    fn char_indices(&self) -> Utf16CharIndices<'_> {
        Utf16CharIndices::new(self)
    }
}
154
#[cfg(test)]
mod tests {
    use crate::Utf16CharsEx;

    /// Returns `true` when decoding `units` front to back yields exactly
    /// the single `char` `expected`.
    fn yields_only(units: &[u16], expected: char) -> bool {
        let mut decoded = units.chars();
        decoded.next() == Some(expected) && decoded.next().is_none()
    }

    /// Returns `true` when decoding `units` back to front yields exactly
    /// the single `char` `expected`.
    fn yields_only_rev(units: &[u16], expected: char) -> bool {
        let mut decoded = units.chars().rev();
        decoded.next() == Some(expected) && decoded.next().is_none()
    }

    // Code units adjacent to the surrogate range pass through unchanged;
    // the first and last surrogate, when alone, become U+FFFD.
    #[test]
    fn test_boundaries() {
        assert!(yields_only(&[0xD7FFu16], '\u{D7FF}'));
        assert!(yields_only(&[0xE000u16], '\u{E000}'));
        assert!(yields_only(&[0xD800u16], '\u{FFFD}'));
        assert!(yields_only(&[0xDFFFu16], '\u{FFFD}'));
    }

    // An unpaired surrogate followed by `a` decodes like U+FFFD followed by `a`.
    #[test]
    fn test_unpaired() {
        let expected = [0xFFFDu16, 0x0061u16];
        for lone in [0xD800u16, 0xDFFFu16] {
            let input = [lone, 0x0061u16];
            assert!(input.as_slice().chars().eq(expected.as_slice().chars()));
        }
    }

    // Same as `test_unpaired`, but decoding from the back.
    #[test]
    fn test_unpaired_rev() {
        let expected = [0xFFFDu16, 0x0061u16];
        for lone in [0xD800u16, 0xDFFFu16] {
            let input = [lone, 0x0061u16];
            assert!(input
                .as_slice()
                .chars()
                .rev()
                .eq(expected.as_slice().chars().rev()));
        }
    }

    // A well-formed surrogate pair decodes to one supplementary-plane `char`.
    #[test]
    fn test_paired() {
        assert!(yields_only(&[0xD83Eu16, 0xDD73u16], '🥳'));
    }

    // Same as `test_paired`, but decoding from the back.
    #[test]
    fn test_paired_rev() {
        assert!(yields_only_rev(&[0xD83Eu16, 0xDD73u16], '🥳'));
    }

    // `as_slice` reflects how much input is left after each step.
    #[test]
    fn test_as_slice() {
        let units = [0x0061u16, 0x0062u16];
        let mut iter = units.as_slice().chars();

        let at_start = iter.as_slice();
        assert_eq!(iter.next(), Some('a'));
        let in_middle = iter.as_slice();
        assert_eq!(iter.next(), Some('b'));
        let at_end = iter.as_slice();

        assert_eq!(at_start, [0x0061u16, 0x0062u16].as_slice());
        assert_eq!(in_middle, [0x0062u16].as_slice());
        assert!(at_end.is_empty());
    }
}
237}