icu_locid/parser/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod errors;
6mod langid;
7mod locale;
8
9pub use errors::ParserError;
10pub use langid::{
11    parse_language_identifier, parse_language_identifier_from_iter,
12    parse_language_identifier_with_single_variant,
13    parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode,
14};
15
16pub use locale::{
17    parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
18};
19
/// Reports whether the byte at position `idx` of `slice` is a subtag separator,
/// i.e. either `-` or `_`.
///
/// # Panics
///
/// Panics if `idx` is out of bounds for `slice`; callers are expected to have
/// checked the index beforehand.
#[inline]
const fn is_separator(slice: &[u8], idx: usize) -> bool {
    #[allow(clippy::indexing_slicing)] // the caller guarantees `idx` is in bounds
    let byte = slice[idx];
    matches!(byte, b'-' | b'_')
}
26
27const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
28    debug_assert!(idx < slice.len());
29
30    // This function is called only on the idx == 0 or on a separator.
31    let (start, mut end) = if is_separator(slice, idx) {
32        // If it's a separator, set the start to idx+1 and advance the idx to the next char.
33        (idx + 1, idx + 1)
34    } else {
35        // If it's idx=0, start is 0 and end is set to 1
36        debug_assert!(idx == 0);
37        (0, 1)
38    };
39
40    while end < slice.len() && !is_separator(slice, end) {
41        // Advance until we reach end of slice or a separator.
42        end += 1;
43    }
44    // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"`
45    (start, end)
46}
47
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// Its shape is unusual because of the focus on performance and the restrictions Rust places
// on `const` functions.
//
// The iterator is eager: the bounds of the upcoming subtag are computed ahead of time, in `new`
// and again each time the iterator advances. It does not reject malformed input itself; slices
// such as `"-"`, `"-en"` or `"en-"` produce empty subtags, presumably so that the parsers
// consuming the iterator can detect and reject them.
//
// Two flavors of API are provided: `next_manual` and `peek_manual` are `const fn`s that work
// with `(start, end)` index pairs into `slice`, while `next` and `peek` are the usual
// `Peekable`-like APIs that hand out the subtag bytes.
//
// All of these return an `Option`, where `None` means that the iterator is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    // The complete input that is being split into subtags.
    pub slice: &'a [u8],
    // Set once the last subtag has been handed out; from then on only `None` is returned.
    done: bool,
    // `(start, end)` bounds (end exclusive) of the subtag that will be returned next.
    //
    // done + subtag is faster than Option<(usize, usize)>
    // at the time of writing.
    subtag: (usize, usize),
}
68
69impl<'a> SubtagIterator<'a> {
70    pub const fn new(slice: &'a [u8]) -> Self {
71        let subtag = if slice.is_empty() || is_separator(slice, 0) {
72            // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"`
73            (0, 0)
74        } else {
75            get_current_subtag(slice, 0)
76        };
77        Self {
78            slice,
79            done: false,
80            subtag,
81        }
82    }
83
84    pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) {
85        if self.done {
86            return (self, None);
87        }
88        let result = self.subtag;
89        if result.1 < self.slice.len() {
90            self.subtag = get_current_subtag(self.slice, result.1);
91        } else {
92            self.done = true;
93        }
94        (self, Some(result))
95    }
96
97    pub const fn peek_manual(&self) -> Option<(usize, usize)> {
98        if self.done {
99            return None;
100        }
101        Some(self.subtag)
102    }
103
104    pub fn peek(&self) -> Option<&'a [u8]> {
105        #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices
106        self.peek_manual().map(|(s, e)| &self.slice[s..e])
107    }
108}
109
110impl<'a> Iterator for SubtagIterator<'a> {
111    type Item = &'a [u8];
112
113    fn next(&mut self) -> Option<Self::Item> {
114        let (s, res) = self.next_manual();
115        *self = s;
116        #[allow(clippy::indexing_slicing)] // next_manual returns valid indices
117        res.map(|(s, e)| &self.slice[s..e])
118    }
119}
120
#[cfg(test)]
mod test {
    use super::*;

    /// Views subtag bytes as `&str` so that assertions can use string literals.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    /// Drains a fresh `SubtagIterator` over `input` and returns every subtag it yields.
    fn subtags(input: &str) -> Vec<&str> {
        SubtagIterator::new(input.as_bytes())
            .map(slice_to_str)
            .collect()
    }

    #[test]
    fn subtag_iterator_peek_test() {
        let slice = "de_at-u-ca-foobar";
        let mut si = SubtagIterator::new(slice.as_bytes());

        for &expected in &["de", "at", "u", "ca", "foobar"] {
            // Peeking, even repeatedly, must not advance the iterator.
            assert_eq!(si.peek().map(slice_to_str), Some(expected));
            assert_eq!(si.peek().map(slice_to_str), Some(expected));
            assert_eq!(si.next().map(slice_to_str), Some(expected));
        }

        // Once the last subtag is consumed, both views report exhaustion.
        assert_eq!(si.peek(), None);
        assert_eq!(si.next(), None);
    }

    #[test]
    fn subtag_iterator_test() {
        // (input, complete sequence of subtags the iterator must yield)
        let cases: &[(&str, &[&str])] = &[
            ("", &[""]),
            ("-", &["", ""]),
            ("-en", &["", "en"]),
            ("en", &["en"]),
            ("en-", &["en", ""]),
            ("--", &["", "", ""]),
            ("-en-", &["", "en", ""]),
            ("en--US", &["en", "", "US"]),
            ("de_at-u-ca-foobar", &["de", "at", "u", "ca", "foobar"]),
        ];

        for &(input, expected) in cases {
            assert_eq!(subtags(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn subtag_iterator_stays_exhausted_test() {
        let mut si = SubtagIterator::new("en".as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.next(), None);
        // Further calls keep returning `None`.
        assert_eq!(si.next(), None);
        assert_eq!(si.peek(), None);
    }

    #[test]
    fn get_current_subtag_test() {
        // (input, index to start from, expected `(start, end)` bounds)
        let cases: &[(&str, usize, (usize, usize))] = &[
            ("-", 0, (1, 1)),
            ("-en", 0, (1, 3)),
            ("-en-", 3, (4, 4)),
            ("en", 0, (0, 2)),
            ("en-", 0, (0, 2)),
            ("en-", 2, (3, 3)),
            ("en--US", 0, (0, 2)),
            ("en--US", 2, (3, 3)),
            ("en--US", 3, (4, 6)),
            ("--", 0, (1, 1)),
            ("--", 1, (2, 2)),
            ("de_at", 2, (3, 5)),
        ];

        for &(input, idx, expected) in cases {
            assert_eq!(
                get_current_subtag(input.as_bytes(), idx),
                expected,
                "input: {:?}, idx: {}",
                input,
                idx
            );
        }
    }
}