chrono/format/
scan.rs

// This is a part of Chrono.
// See README.md and LICENSE.txt for details.

/*!
 * Various scanning routines for the parser.
 */
7
8use super::{INVALID, OUT_OF_RANGE, ParseResult, TOO_SHORT};
9use crate::Weekday;
10
11/// Tries to parse the non-negative number from `min` to `max` digits.
12///
13/// The absence of digits at all is an unconditional error.
14/// More than `max` digits are consumed up to the first `max` digits.
15/// Any number that does not fit in `i64` is an error.
16#[inline]
17pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> {
18    assert!(min <= max);
19
20    // We are only interested in ascii numbers, so we can work with the `str` as bytes. We stop on
21    // the first non-numeric byte, which may be another ascii character or beginning of multi-byte
22    // UTF-8 character.
23    let bytes = s.as_bytes();
24    if bytes.len() < min {
25        return Err(TOO_SHORT);
26    }
27
28    let mut n = 0i64;
29    for (i, c) in bytes.iter().take(max).cloned().enumerate() {
30        // cloned() = copied()
31        if !c.is_ascii_digit() {
32            if i < min {
33                return Err(INVALID);
34            } else {
35                return Ok((&s[i..], n));
36            }
37        }
38
39        n = match n.checked_mul(10).and_then(|n| n.checked_add((c - b'0') as i64)) {
40            Some(n) => n,
41            None => return Err(OUT_OF_RANGE),
42        };
43    }
44
45    Ok((&s[core::cmp::min(max, bytes.len())..], n))
46}
47
48/// Tries to consume at least one digits as a fractional second.
49/// Returns the number of whole nanoseconds (0--999,999,999).
50pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, i64)> {
51    // record the number of digits consumed for later scaling.
52    let origlen = s.len();
53    let (s, v) = number(s, 1, 9)?;
54    let consumed = origlen - s.len();
55
56    // scale the number accordingly.
57    static SCALE: [i64; 10] =
58        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];
59    let v = v.checked_mul(SCALE[consumed]).ok_or(OUT_OF_RANGE)?;
60
61    // if there are more than 9 digits, skip next digits.
62    let s = s.trim_start_matches(|c: char| c.is_ascii_digit());
63
64    Ok((s, v))
65}
66
67/// Tries to consume a fixed number of digits as a fractional second.
68/// Returns the number of whole nanoseconds (0--999,999,999).
69pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> {
70    // record the number of digits consumed for later scaling.
71    let (s, v) = number(s, digits, digits)?;
72
73    // scale the number accordingly.
74    static SCALE: [i64; 10] =
75        [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1];
76    let v = v.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?;
77
78    Ok((s, v))
79}
80
81/// Tries to parse the month index (0 through 11) with the first three ASCII letters.
82pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> {
83    if s.len() < 3 {
84        return Err(TOO_SHORT);
85    }
86    let buf = s.as_bytes();
87    let month0 = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) {
88        (b'j', b'a', b'n') => 0,
89        (b'f', b'e', b'b') => 1,
90        (b'm', b'a', b'r') => 2,
91        (b'a', b'p', b'r') => 3,
92        (b'm', b'a', b'y') => 4,
93        (b'j', b'u', b'n') => 5,
94        (b'j', b'u', b'l') => 6,
95        (b'a', b'u', b'g') => 7,
96        (b's', b'e', b'p') => 8,
97        (b'o', b'c', b't') => 9,
98        (b'n', b'o', b'v') => 10,
99        (b'd', b'e', b'c') => 11,
100        _ => return Err(INVALID),
101    };
102    Ok((&s[3..], month0))
103}
104
105/// Tries to parse the weekday with the first three ASCII letters.
106pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
107    if s.len() < 3 {
108        return Err(TOO_SHORT);
109    }
110    let buf = s.as_bytes();
111    let weekday = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) {
112        (b'm', b'o', b'n') => Weekday::Mon,
113        (b't', b'u', b'e') => Weekday::Tue,
114        (b'w', b'e', b'd') => Weekday::Wed,
115        (b't', b'h', b'u') => Weekday::Thu,
116        (b'f', b'r', b'i') => Weekday::Fri,
117        (b's', b'a', b't') => Weekday::Sat,
118        (b's', b'u', b'n') => Weekday::Sun,
119        _ => return Err(INVALID),
120    };
121    Ok((&s[3..], weekday))
122}
123
124/// Tries to parse the month index (0 through 11) with short or long month names.
125/// It prefers long month names to short month names when both are possible.
126pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> {
127    // lowercased month names, minus first three chars
128    static LONG_MONTH_SUFFIXES: [&[u8]; 12] = [
129        b"uary", b"ruary", b"ch", b"il", b"", b"e", b"y", b"ust", b"tember", b"ober", b"ember",
130        b"ember",
131    ];
132
133    let (mut s, month0) = short_month0(s)?;
134
135    // tries to consume the suffix if possible
136    let suffix = LONG_MONTH_SUFFIXES[month0 as usize];
137    if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
138        s = &s[suffix.len()..];
139    }
140
141    Ok((s, month0))
142}
143
144/// Tries to parse the weekday with short or long weekday names.
145/// It prefers long weekday names to short weekday names when both are possible.
146pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
147    // lowercased weekday names, minus first three chars
148    static LONG_WEEKDAY_SUFFIXES: [&[u8]; 7] =
149        [b"day", b"sday", b"nesday", b"rsday", b"day", b"urday", b"day"];
150
151    let (mut s, weekday) = short_weekday(s)?;
152
153    // tries to consume the suffix if possible
154    let suffix = LONG_WEEKDAY_SUFFIXES[weekday.num_days_from_monday() as usize];
155    if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
156        s = &s[suffix.len()..];
157    }
158
159    Ok((s, weekday))
160}
161
162/// Tries to consume exactly one given character.
163pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> {
164    match s.as_bytes().first() {
165        Some(&c) if c == c1 => Ok(&s[1..]),
166        Some(_) => Err(INVALID),
167        None => Err(TOO_SHORT),
168    }
169}
170
171/// Tries to consume one or more whitespace.
172pub(super) fn space(s: &str) -> ParseResult<&str> {
173    let s_ = s.trim_start();
174    if s_.len() < s.len() {
175        Ok(s_)
176    } else if s.is_empty() {
177        Err(TOO_SHORT)
178    } else {
179        Err(INVALID)
180    }
181}
182
183/// Consumes any number (including zero) of colon or spaces.
184pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> {
185    Ok(s.trim_start_matches(|c: char| c == ':' || c.is_whitespace()))
186}
187
188/// Parse a timezone from `s` and return the offset in seconds.
189///
190/// The `consume_colon` function is used to parse a mandatory or optional `:`
191/// separator between hours offset and minutes offset.
192///
193/// The `allow_missing_minutes` flag allows the timezone minutes offset to be
194/// missing from `s`.
195///
196/// The `allow_tz_minus_sign` flag allows the timezone offset negative character
197/// to also be `−` MINUS SIGN (U+2212) in addition to the typical
198/// ASCII-compatible `-` HYPHEN-MINUS (U+2D).
199/// This is part of [RFC 3339 & ISO 8601].
200///
201/// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC
202pub(crate) fn timezone_offset<F>(
203    mut s: &str,
204    mut consume_colon: F,
205    allow_zulu: bool,
206    allow_missing_minutes: bool,
207    allow_tz_minus_sign: bool,
208) -> ParseResult<(&str, i32)>
209where
210    F: FnMut(&str) -> ParseResult<&str>,
211{
212    if allow_zulu {
213        if let Some(&b'Z' | &b'z') = s.as_bytes().first() {
214            return Ok((&s[1..], 0));
215        }
216    }
217
218    const fn digits(s: &str) -> ParseResult<(u8, u8)> {
219        let b = s.as_bytes();
220        if b.len() < 2 { Err(TOO_SHORT) } else { Ok((b[0], b[1])) }
221    }
222    let negative = match s.chars().next() {
223        Some('+') => {
224            // PLUS SIGN (U+2B)
225            s = &s['+'.len_utf8()..];
226
227            false
228        }
229        Some('-') => {
230            // HYPHEN-MINUS (U+2D)
231            s = &s['-'.len_utf8()..];
232
233            true
234        }
235        Some('−') => {
236            // MINUS SIGN (U+2212)
237            if !allow_tz_minus_sign {
238                return Err(INVALID);
239            }
240            s = &s['−'.len_utf8()..];
241
242            true
243        }
244        Some(_) => return Err(INVALID),
245        None => return Err(TOO_SHORT),
246    };
247
248    // hours (00--99)
249    let hours = match digits(s)? {
250        (h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * 10 + (h2 - b'0')),
251        _ => return Err(INVALID),
252    };
253    s = &s[2..];
254
255    // colons (and possibly other separators)
256    s = consume_colon(s)?;
257
258    // minutes (00--59)
259    // if the next two items are digits then we have to add minutes
260    let minutes = if let Ok(ds) = digits(s) {
261        match ds {
262            (m1 @ b'0'..=b'5', m2 @ b'0'..=b'9') => i32::from((m1 - b'0') * 10 + (m2 - b'0')),
263            (b'6'..=b'9', b'0'..=b'9') => return Err(OUT_OF_RANGE),
264            _ => return Err(INVALID),
265        }
266    } else if allow_missing_minutes {
267        0
268    } else {
269        return Err(TOO_SHORT);
270    };
271    s = match s.len() {
272        len if len >= 2 => &s[2..],
273        0 => s,
274        _ => return Err(TOO_SHORT),
275    };
276
277    let seconds = hours * 3600 + minutes * 60;
278    Ok((s, if negative { -seconds } else { seconds }))
279}
280
281/// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones.
282/// May return `None` which indicates an insufficient offset data (i.e. `-0000`).
283/// See [RFC 2822 Section 4.3].
284///
285/// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3
286pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, i32)> {
287    // tries to parse legacy time zone names
288    let upto = s.as_bytes().iter().position(|&c| !c.is_ascii_alphabetic()).unwrap_or(s.len());
289    if upto > 0 {
290        let name = &s.as_bytes()[..upto];
291        let s = &s[upto..];
292        let offset_hours = |o| Ok((s, o * 3600));
293        // RFC 2822 requires support for some named North America timezones, a small subset of all
294        // named timezones.
295        if name.eq_ignore_ascii_case(b"gmt")
296            || name.eq_ignore_ascii_case(b"ut")
297            || name.eq_ignore_ascii_case(b"z")
298        {
299            return offset_hours(0);
300        } else if name.eq_ignore_ascii_case(b"edt") {
301            return offset_hours(-4);
302        } else if name.eq_ignore_ascii_case(b"est") || name.eq_ignore_ascii_case(b"cdt") {
303            return offset_hours(-5);
304        } else if name.eq_ignore_ascii_case(b"cst") || name.eq_ignore_ascii_case(b"mdt") {
305            return offset_hours(-6);
306        } else if name.eq_ignore_ascii_case(b"mst") || name.eq_ignore_ascii_case(b"pdt") {
307            return offset_hours(-7);
308        } else if name.eq_ignore_ascii_case(b"pst") {
309            return offset_hours(-8);
310        } else if name.len() == 1 {
311            if let b'a'..=b'i' | b'k'..=b'y' | b'A'..=b'I' | b'K'..=b'Y' = name[0] {
312                // recommended by RFC 2822: consume but treat it as -0000
313                return Ok((s, 0));
314            }
315        }
316        Err(INVALID)
317    } else {
318        timezone_offset(s, |s| Ok(s), false, false, false)
319    }
320}
321
322/// Tries to consume an RFC2822 comment including preceding ` `.
323///
324/// Returns the remaining string after the closing parenthesis.
325pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
326    use CommentState::*;
327
328    let s = s.trim_start();
329
330    let mut state = Start;
331    for (i, c) in s.bytes().enumerate() {
332        state = match (state, c) {
333            (Start, b'(') => Next(1),
334            (Next(1), b')') => return Ok((&s[i + 1..], ())),
335            (Next(depth), b'\\') => Escape(depth),
336            (Next(depth), b'(') => Next(depth + 1),
337            (Next(depth), b')') => Next(depth - 1),
338            (Next(depth), _) | (Escape(depth), _) => Next(depth),
339            _ => return Err(INVALID),
340        };
341    }
342
343    Err(TOO_SHORT)
344}
345
346enum CommentState {
347    Start,
348    Next(usize),
349    Escape(usize),
350}
351
#[cfg(test)]
mod tests {
    use super::{
        comment_2822, nanosecond, nanosecond_fixed, short_or_long_month0, short_or_long_weekday,
        timezone_offset_2822,
    };
    use crate::Weekday;
    use crate::format::{INVALID, TOO_SHORT};

    /// Comment scanning: leading whitespace, nesting, escapes and unterminated input.
    #[test]
    fn test_rfc2822_comments() {
        let cases = [
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for (input, expected) in &cases {
            let actual = comment_2822(input).map(|(rest, ())| rest);
            assert_eq!(
                *expected, actual,
                "{:?} expected to produce {:?}, but produced {:?}.",
                input, expected, actual
            );
        }
    }

    /// Legacy zone names are matched case-insensitively; numeric offsets still work.
    #[test]
    fn test_timezone_offset_2822() {
        let accepted = [("cSt", -21600), ("pSt", -28800), ("mSt", -25200), ("-1551", -57060)];
        for &(input, seconds) in &accepted {
            assert_eq!(timezone_offset_2822(input), Ok(("", seconds)));
        }

        assert_eq!(timezone_offset_2822("Gp"), Err(INVALID));
    }

    /// Month abbreviations in mixed case, plus a non-ASCII remainder left untouched.
    #[test]
    fn test_short_or_long_month0() {
        let cases = [
            ("JUn", ("", 5)),
            ("mAy", ("", 4)),
            ("AuG", ("", 7)),
            ("Aprâ", ("â", 3)),
            ("JUl", ("", 6)),
            ("mAr", ("", 2)),
            ("Jan", ("", 0)),
        ];
        for &(input, expected) in &cases {
            assert_eq!(short_or_long_month0(input), Ok(expected));
        }
    }

    /// An incomplete long name consumes only the three-letter abbreviation.
    #[test]
    fn test_short_or_long_weekday() {
        assert_eq!(short_or_long_weekday("sAtu"), Ok(("u", Weekday::Sat)));
        assert_eq!(short_or_long_weekday("thu"), Ok(("", Weekday::Thu)));
    }

    /// Zero requested digits succeed on empty input; one requested digit does not.
    #[test]
    fn test_nanosecond_fixed() {
        assert_eq!(nanosecond_fixed("", 0usize), Ok(("", 0)));
        assert!(nanosecond_fixed("", 1usize).is_err());
    }

    /// A single digit is scaled to tenths of a second; non-ASCII input ends the number.
    #[test]
    fn test_nanosecond() {
        assert_eq!(nanosecond("2Ù"), Ok(("Ù", 200000000)));
        assert_eq!(nanosecond("8"), Ok(("", 800000000)));
    }
}