url/
parser.rs

1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use alloc::string::String;
10use alloc::string::ToString;
11use core::fmt::{self, Formatter, Write};
12use core::str;
13
14use crate::host::{Host, HostInternal};
15use crate::Url;
16use form_urlencoded::EncodingOverride;
17use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
18
/// Bytes that are percent-encoded in a URL fragment: everything in
/// `CONTROLS` plus space, `"`, `<`, `>` and the backtick.
///
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Bytes that are percent-encoded in a URL path: the `FRAGMENT` set plus
/// `#`, `?`, `{` and `}`.
///
/// https://url.spec.whatwg.org/#path-percent-encode-set
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Bytes that are percent-encoded in the username and password: the `PATH`
/// set plus the delimiters listed below.
///
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// The `PATH` set extended with `/` and `%`, so that neither a separator nor
/// a percent sign survives unescaped inside a single path segment.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// https://url.spec.whatwg.org/#query-state
// Base query set: `CONTROLS` plus space, `"`, `#`, `<` and `>`.
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
// Query set extended with the single quote (`'`); presumably used for
// special-scheme URLs, as the name suggests — confirm against the query parser.
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');
47
/// Shorthand for a `Result` whose error type is `ParseError`.
pub type ParseResult<T> = Result<T, ParseError>;
49
// Generates the public `ParseError` enum from a list of
// `Variant => "description",` pairs: one field-less variant per entry, plus a
// `Display` impl that writes the matching description string.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// discouraged with an unused variant.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                // One arm per variant, each writing its description verbatim.
                match *self {
                    $(
                        ParseError::$name => fmt.write_str($description),
                    )+
                }
            }
        }
    }
}
75
// `ParseError` is an error type in both build configurations: it implements
// `std::error::Error` when the `std` feature is enabled ...
#[cfg(feature = "std")]
impl std::error::Error for ParseError {}

// ... and `core::error::Error` otherwise.
#[cfg(not(feature = "std"))]
impl core::error::Error for ParseError {}
81
// The concrete `ParseError` variants and the message each one displays.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
94
95impl From<::idna::Errors> for ParseError {
96    fn from(_: ::idna::Errors) -> ParseError {
97        ParseError::IdnaError
98    }
99}
100
// Generates the public `SyntaxViolation` enum from a list of
// `Variant => "description",` pairs. Each description literal is used twice:
// as the variant's rustdoc text (inside a ```text block) and as the value
// returned by `SyntaxViolation::description`.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: literal,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// forbidden.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                /// ```text
                #[doc = $description]
                /// ```
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the static, human-readable description of this violation.
            pub fn description(&self) -> &'static str {
                match *self {
                    $(
                        SyntaxViolation::$name => $description,
                    )+
                }
            }
        }
    }
}
129
// The concrete `SyntaxViolation` variants and their descriptions.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
146
147impl fmt::Display for SyntaxViolation {
148    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
149        fmt::Display::fmt(self.description(), f)
150    }
151}
152
/// Coarse classification of a URL scheme, as far as the parser cares:
/// `file`, one of the other special schemes, or anything else.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum SchemeType {
    File,
    SpecialNotFile,
    NotSpecial,
}

impl SchemeType {
    /// `true` for `File` and `SpecialNotFile`, `false` for `NotSpecial`.
    pub fn is_special(&self) -> bool {
        match self {
            SchemeType::File | SchemeType::SpecialNotFile => true,
            SchemeType::NotSpecial => false,
        }
    }

    /// `true` only for `File`.
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }
}

/// Classifies a scheme name. The comparison is exact (case-sensitive), so the
/// scheme must be given in the lowercase form listed here.
impl<T: AsRef<str>> From<T> for SchemeType {
    fn from(s: T) -> Self {
        let scheme = s.as_ref();
        if scheme == "file" {
            SchemeType::File
        } else if matches!(scheme, "http" | "https" | "ws" | "wss" | "ftp") {
            SchemeType::SpecialNotFile
        } else {
            SchemeType::NotSpecial
        }
    }
}
179
/// Returns the default port of a scheme, or `None` when the scheme has no
/// default port known to this function.
///
/// The scheme is compared exactly: `"http"`/`"ws"` map to 80,
/// `"https"`/`"wss"` to 443 and `"ftp"` to 21.
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "ftp" => 21,
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        _ => return None,
    };
    Some(port)
}
188
/// A cursor over the characters of the string being parsed.
///
/// Cloning is used throughout the parser as a way to look ahead: a clone can
/// be advanced freely without moving the original.
#[derive(Clone, Debug)]
pub struct Input<'i> {
    // Remaining, not yet consumed characters of the (possibly trimmed) input.
    chars: str::Chars<'i>,
}
193
194impl<'i> Input<'i> {
195    pub fn new_no_trim(input: &'i str) -> Self {
196        Input {
197            chars: input.chars(),
198        }
199    }
200
201    pub fn new_trim_tab_and_newlines(
202        original_input: &'i str,
203        vfn: Option<&dyn Fn(SyntaxViolation)>,
204    ) -> Self {
205        let input = original_input.trim_matches(ascii_tab_or_new_line);
206        if let Some(vfn) = vfn {
207            if input.len() < original_input.len() {
208                vfn(SyntaxViolation::C0SpaceIgnored)
209            }
210            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
211                vfn(SyntaxViolation::TabOrNewlineIgnored)
212            }
213        }
214        Input {
215            chars: input.chars(),
216        }
217    }
218
219    pub fn new_trim_c0_control_and_space(
220        original_input: &'i str,
221        vfn: Option<&dyn Fn(SyntaxViolation)>,
222    ) -> Self {
223        let input = original_input.trim_matches(c0_control_or_space);
224        if let Some(vfn) = vfn {
225            if input.len() < original_input.len() {
226                vfn(SyntaxViolation::C0SpaceIgnored)
227            }
228            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
229                vfn(SyntaxViolation::TabOrNewlineIgnored)
230            }
231        }
232        Input {
233            chars: input.chars(),
234        }
235    }
236
237    #[inline]
238    pub fn is_empty(&self) -> bool {
239        self.clone().next().is_none()
240    }
241
242    #[inline]
243    fn starts_with<P: Pattern>(&self, p: P) -> bool {
244        p.split_prefix(&mut self.clone())
245    }
246
247    #[inline]
248    pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
249        let mut remaining = self.clone();
250        if p.split_prefix(&mut remaining) {
251            Some(remaining)
252        } else {
253            None
254        }
255    }
256
257    #[inline]
258    fn split_first(&self) -> (Option<char>, Self) {
259        let mut remaining = self.clone();
260        (remaining.next(), remaining)
261    }
262
263    #[inline]
264    fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
265        let mut count = 0;
266        let mut remaining = self.clone();
267        loop {
268            let mut input = remaining.clone();
269            if matches!(input.next(), Some(c) if f(c)) {
270                remaining = input;
271                count += 1;
272            } else {
273                return (count, remaining);
274            }
275        }
276    }
277
278    #[inline]
279    fn next_utf8(&mut self) -> Option<(char, &'i str)> {
280        loop {
281            let utf8 = self.chars.as_str();
282            match self.chars.next() {
283                Some(c) => {
284                    if !matches!(c, '\t' | '\n' | '\r') {
285                        return Some((c, &utf8[..c.len_utf8()]));
286                    }
287                }
288                None => return None,
289            }
290        }
291    }
292}
293
/// Something that can be matched against the front of an `Input`.
pub trait Pattern {
    /// Tries to consume this pattern from the front of `input`, returning
    /// whether it matched. `input` may have been advanced even when the
    /// result is `false`, so callers that need to keep their position should
    /// pass a clone.
    fn split_prefix(self, input: &mut Input) -> bool;
}
297
298impl Pattern for char {
299    fn split_prefix(self, input: &mut Input) -> bool {
300        input.next() == Some(self)
301    }
302}
303
304impl<'a> Pattern for &'a str {
305    fn split_prefix(self, input: &mut Input) -> bool {
306        for c in self.chars() {
307            if input.next() != Some(c) {
308                return false;
309            }
310        }
311        true
312    }
313}
314
315impl<F: FnMut(char) -> bool> Pattern for F {
316    fn split_prefix(self, input: &mut Input) -> bool {
317        input.next().map_or(false, self)
318    }
319}
320
321impl<'i> Iterator for Input<'i> {
322    type Item = char;
323    fn next(&mut self) -> Option<char> {
324        self.chars
325            .by_ref()
326            .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
327    }
328}
329
/// State carried through a single URL parse.
pub struct Parser<'a> {
    /// Output buffer: the serialized URL is built up here piece by piece and
    /// finally moved into the resulting `Url`.
    pub serialization: String,
    /// Base URL that input without a scheme is resolved against, if any.
    pub base_url: Option<&'a Url>,
    /// Optional encoding override; by its name it applies to the query
    /// component (its use is not visible in this part of the file — confirm).
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback that receives every non-fatal `SyntaxViolation`.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which entry point is driving the parser; see `Context`.
    pub context: Context,
}
337
/// Identifies who is driving the parser, which lets individual states adjust
/// their behavior (for instance, `parse_scheme` accepts end-of-input without
/// a `:` only in the `Setter` context).
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// Presumably a full URL parse — confirm against the call sites.
    UrlParser,
    /// The parser was created through `Parser::for_setter`.
    Setter,
    /// Presumably the path-segment setter API — confirm against the call sites.
    PathSegmentSetter,
}
344
345impl<'a> Parser<'a> {
346    fn log_violation(&self, v: SyntaxViolation) {
347        if let Some(f) = self.violation_fn {
348            f(v)
349        }
350    }
351
352    fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
353        if let Some(f) = self.violation_fn {
354            if test() {
355                f(v)
356            }
357        }
358    }
359
360    pub fn for_setter(serialization: String) -> Parser<'a> {
361        Parser {
362            serialization,
363            base_url: None,
364            query_encoding_override: None,
365            violation_fn: None,
366            context: Context::Setter,
367        }
368    }
369
370    /// https://url.spec.whatwg.org/#concept-basic-url-parser
371    pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
372        let input = Input::new_trim_c0_control_and_space(input, self.violation_fn);
373        if let Ok(remaining) = self.parse_scheme(input.clone()) {
374            return self.parse_with_scheme(remaining);
375        }
376
377        // No-scheme state
378        if let Some(base_url) = self.base_url {
379            if input.starts_with('#') {
380                self.fragment_only(base_url, input)
381            } else if base_url.cannot_be_a_base() {
382                Err(ParseError::RelativeUrlWithCannotBeABaseBase)
383            } else {
384                let scheme_type = SchemeType::from(base_url.scheme());
385                if scheme_type.is_file() {
386                    self.parse_file(input, scheme_type, Some(base_url))
387                } else {
388                    self.parse_relative(input, scheme_type, base_url)
389                }
390            }
391        } else {
392            Err(ParseError::RelativeUrlWithoutBase)
393        }
394    }
395
396    pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
397        if input.is_empty() || !input.starts_with(ascii_alpha) {
398            return Err(());
399        }
400        debug_assert!(self.serialization.is_empty());
401        while let Some(c) = input.next() {
402            match c {
403                'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
404                    self.serialization.push(c.to_ascii_lowercase())
405                }
406                ':' => return Ok(input),
407                _ => {
408                    self.serialization.clear();
409                    return Err(());
410                }
411            }
412        }
413        // EOF before ':'
414        if self.context == Context::Setter {
415            Ok(input)
416        } else {
417            self.serialization.clear();
418            Err(())
419        }
420    }
421
422    fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
423        use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
424        let scheme_end = to_u32(self.serialization.len())?;
425        let scheme_type = SchemeType::from(&self.serialization);
426        self.serialization.push(':');
427        match scheme_type {
428            SchemeType::File => {
429                self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
430                let base_file_url = self.base_url.and_then(|base| {
431                    if base.scheme() == "file" {
432                        Some(base)
433                    } else {
434                        None
435                    }
436                });
437                self.serialization.clear();
438                self.parse_file(input, scheme_type, base_file_url)
439            }
440            SchemeType::SpecialNotFile => {
441                // special relative or authority state
442                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
443                if let Some(base_url) = self.base_url {
444                    if slashes_count < 2
445                        && base_url.scheme() == &self.serialization[..scheme_end as usize]
446                    {
447                        // "Cannot-be-a-base" URLs only happen with "not special" schemes.
448                        debug_assert!(!base_url.cannot_be_a_base());
449                        self.serialization.clear();
450                        return self.parse_relative(input, scheme_type, base_url);
451                    }
452                }
453                // special authority slashes state
454                self.log_violation_if(ExpectedDoubleSlash, || {
455                    input
456                        .clone()
457                        .take_while(|&c| matches!(c, '/' | '\\'))
458                        .collect::<String>()
459                        != "//"
460                });
461                self.after_double_slash(remaining, scheme_type, scheme_end)
462            }
463            SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
464        }
465    }
466
467    /// Scheme other than file, http, https, ws, ws, ftp.
468    fn parse_non_special(
469        mut self,
470        input: Input<'_>,
471        scheme_type: SchemeType,
472        scheme_end: u32,
473    ) -> ParseResult<Url> {
474        // path or authority state (
475        if let Some(input) = input.split_prefix("//") {
476            return self.after_double_slash(input, scheme_type, scheme_end);
477        }
478        // Anarchist URL (no authority)
479        let path_start = to_u32(self.serialization.len())?;
480        let username_end = path_start;
481        let host_start = path_start;
482        let host_end = path_start;
483        let host = HostInternal::None;
484        let port = None;
485        let remaining = if let Some(input) = input.split_prefix('/') {
486            self.serialization.push('/');
487            self.parse_path(scheme_type, &mut false, path_start as usize, input)
488        } else {
489            self.parse_cannot_be_a_base_path(input)
490        };
491        self.with_query_and_fragment(
492            scheme_type,
493            scheme_end,
494            username_end,
495            host_start,
496            host_end,
497            host,
498            port,
499            path_start,
500            remaining,
501        )
502    }
503
    /// Parses what follows a `file:` scheme, or scheme-less input resolved
    /// against a `file` base URL.
    ///
    /// `self.serialization` must be empty on entry; this function writes the
    /// `file:` prefix itself. `base_file_url` is the base to resolve against,
    /// if there is one.
    ///
    /// Overview of the cases handled:
    /// * input starts with two slashes (either kind): parse a host, then a path;
    /// * input starts with one slash: parse a path, first inheriting the
    ///   base's drive letter or host when the input does not itself start
    ///   with a Windows drive letter segment;
    /// * no leading slash, with a base: reuse the base as far as the input
    ///   allows (everything / up to the query / up to the fragment / merged path);
    /// * no leading slash, no base: parse the input as a path under `file:///`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `parse_file_host`, `to_u32`,
    /// `parse_query_and_fragment`, `fragment_only` and
    /// `with_query_and_fragment`.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                // NOTE: `path_start` here is a flag returned by
                // `parse_file_host`, not an offset; it selects how the path
                // is parsed below.
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                let mut has_host = !matches!(host, HostInternal::None);
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                // (`has_host` is passed mutably to the path parser above,
                // which is presumably where it gets cleared in that case.)
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    // No userinfo or port in a file URL: the username ends
                    // where the host starts and the path starts where the
                    // host ends.
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Exactly one leading slash: an absolute path with no host
                // given in the input.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        // Inherit from the base: its drive letter if its
                        // first path segment is one, otherwise its host.
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // From here on the input does not start with a slash or backslash.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base up to its query,
                        // shorten the base path, then parse the input on top
                        // of what remains.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input names its own drive letter, so the base
                        // is ignored and the path is parsed under `file:///`.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No base URL: the whole input is a path under `file:///`.
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
714
715    fn parse_relative(
716        mut self,
717        input: Input<'_>,
718        scheme_type: SchemeType,
719        base_url: &Url,
720    ) -> ParseResult<Url> {
721        // relative state
722        debug_assert!(self.serialization.is_empty());
723        let (first_char, input_after_first_char) = input.split_first();
724        match first_char {
725            None => {
726                // Copy everything except the fragment
727                let before_fragment = match base_url.fragment_start {
728                    Some(i) => &base_url.serialization[..i as usize],
729                    None => &*base_url.serialization,
730                };
731                self.serialization.push_str(before_fragment);
732                Ok(Url {
733                    serialization: self.serialization,
734                    fragment_start: None,
735                    ..*base_url
736                })
737            }
738            Some('?') => {
739                // Copy everything up to the query string
740                let before_query = match (base_url.query_start, base_url.fragment_start) {
741                    (None, None) => &*base_url.serialization,
742                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
743                };
744                self.serialization.push_str(before_query);
745                let (query_start, fragment_start) =
746                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
747                Ok(Url {
748                    serialization: self.serialization,
749                    query_start,
750                    fragment_start,
751                    ..*base_url
752                })
753            }
754            Some('#') => self.fragment_only(base_url, input),
755            Some('/') | Some('\\') => {
756                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
757                if slashes_count >= 2 {
758                    self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
759                        input
760                            .clone()
761                            .take_while(|&c| matches!(c, '/' | '\\'))
762                            .collect::<String>()
763                            != "//"
764                    });
765                    let scheme_end = base_url.scheme_end;
766                    debug_assert!(base_url.byte_at(scheme_end) == b':');
767                    self.serialization
768                        .push_str(base_url.slice(..scheme_end + 1));
769                    if let Some(after_prefix) = input.split_prefix("//") {
770                        return self.after_double_slash(after_prefix, scheme_type, scheme_end);
771                    }
772                    return self.after_double_slash(remaining, scheme_type, scheme_end);
773                }
774                let path_start = base_url.path_start;
775                self.serialization.push_str(base_url.slice(..path_start));
776                self.serialization.push('/');
777                let remaining = self.parse_path(
778                    scheme_type,
779                    &mut true,
780                    path_start as usize,
781                    input_after_first_char,
782                );
783                self.with_query_and_fragment(
784                    scheme_type,
785                    base_url.scheme_end,
786                    base_url.username_end,
787                    base_url.host_start,
788                    base_url.host_end,
789                    base_url.host,
790                    base_url.port,
791                    base_url.path_start,
792                    remaining,
793                )
794            }
795            _ => {
796                let before_query = match (base_url.query_start, base_url.fragment_start) {
797                    (None, None) => &*base_url.serialization,
798                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
799                };
800                self.serialization.push_str(before_query);
801                // FIXME spec says just "remove last entry", not the "pop" algorithm
802                self.pop_path(scheme_type, base_url.path_start as usize);
803                // A special url always has a path.
804                // A path always starts with '/'
805                if self.serialization.len() == base_url.path_start as usize
806                    && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
807                {
808                    self.serialization.push('/');
809                }
810                let remaining = match input.split_first() {
811                    (Some('/'), remaining) => self.parse_path(
812                        scheme_type,
813                        &mut true,
814                        base_url.path_start as usize,
815                        remaining,
816                    ),
817                    _ => {
818                        self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
819                    }
820                };
821                self.with_query_and_fragment(
822                    scheme_type,
823                    base_url.scheme_end,
824                    base_url.username_end,
825                    base_url.host_start,
826                    base_url.host_end,
827                    base_url.host,
828                    base_url.port,
829                    base_url.path_start,
830                    remaining,
831                )
832            }
833        }
834    }
835
836    fn after_double_slash(
837        mut self,
838        input: Input<'_>,
839        scheme_type: SchemeType,
840        scheme_end: u32,
841    ) -> ParseResult<Url> {
842        self.serialization.push('/');
843        self.serialization.push('/');
844        // authority state
845        let before_authority = self.serialization.len();
846        let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
847        let has_authority = before_authority != self.serialization.len();
848        // host state
849        let host_start = to_u32(self.serialization.len())?;
850        let (host_end, host, port, remaining) =
851            self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
852        if host == HostInternal::None && has_authority {
853            return Err(ParseError::EmptyHost);
854        }
855        // path state
856        let path_start = to_u32(self.serialization.len())?;
857        let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
858        self.with_query_and_fragment(
859            scheme_type,
860            scheme_end,
861            username_end,
862            host_start,
863            host_end,
864            host,
865            port,
866            path_start,
867            remaining,
868        )
869    }
870
871    /// Return (username_end, remaining)
872    fn parse_userinfo<'i>(
873        &mut self,
874        mut input: Input<'i>,
875        scheme_type: SchemeType,
876    ) -> ParseResult<(u32, Input<'i>)> {
877        let mut last_at = None;
878        let mut remaining = input.clone();
879        let mut char_count = 0;
880        while let Some(c) = remaining.next() {
881            match c {
882                '@' => {
883                    if last_at.is_some() {
884                        self.log_violation(SyntaxViolation::UnencodedAtSign)
885                    } else {
886                        self.log_violation(SyntaxViolation::EmbeddedCredentials)
887                    }
888                    last_at = Some((char_count, remaining.clone()))
889                }
890                '/' | '?' | '#' => break,
891                '\\' if scheme_type.is_special() => break,
892                _ => (),
893            }
894            char_count += 1;
895        }
896        let (mut userinfo_char_count, remaining) = match last_at {
897            None => return Ok((to_u32(self.serialization.len())?, input)),
898            Some((0, remaining)) => {
899                // Otherwise, if one of the following is true
900                // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
901                // url is special and c is U+005C (\)
902                // If @ flag is set and buffer is the empty string, validation error, return failure.
903                if let (Some(c), _) = remaining.split_first() {
904                    if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
905                        return Err(ParseError::EmptyHost);
906                    }
907                }
908                return Ok((to_u32(self.serialization.len())?, remaining));
909            }
910            Some(x) => x,
911        };
912
913        let mut username_end = None;
914        let mut has_password = false;
915        let mut has_username = false;
916        while userinfo_char_count > 0 {
917            let (c, utf8_c) = input.next_utf8().unwrap();
918            userinfo_char_count -= 1;
919            if c == ':' && username_end.is_none() {
920                // Start parsing password
921                username_end = Some(to_u32(self.serialization.len())?);
922                // We don't add a colon if the password is empty
923                if userinfo_char_count > 0 {
924                    self.serialization.push(':');
925                    has_password = true;
926                }
927            } else {
928                if !has_password {
929                    has_username = true;
930                }
931                self.check_url_code_point(c, &input);
932                self.serialization
933                    .extend(utf8_percent_encode(utf8_c, USERINFO));
934            }
935        }
936        let username_end = match username_end {
937            Some(i) => i,
938            None => to_u32(self.serialization.len())?,
939        };
940        if has_username || has_password {
941            self.serialization.push('@');
942        }
943        Ok((username_end, remaining))
944    }
945
946    fn parse_host_and_port<'i>(
947        &mut self,
948        input: Input<'i>,
949        scheme_end: u32,
950        scheme_type: SchemeType,
951    ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
952        let (host, remaining) = Parser::parse_host(input, scheme_type)?;
953        write!(&mut self.serialization, "{}", host).unwrap();
954        let host_end = to_u32(self.serialization.len())?;
955        if let Host::Domain(h) = &host {
956            if h.is_empty() {
957                // Port with an empty host
958                if remaining.starts_with(":") {
959                    return Err(ParseError::EmptyHost);
960                }
961                if scheme_type.is_special() {
962                    return Err(ParseError::EmptyHost);
963                }
964            }
965        };
966
967        let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
968            let scheme = || default_port(&self.serialization[..scheme_end as usize]);
969            Parser::parse_port(remaining, scheme, self.context)?
970        } else {
971            (None, remaining)
972        };
973        if let Some(port) = port {
974            write!(&mut self.serialization, ":{}", port).unwrap()
975        }
976        Ok((host_end, host.into(), port, remaining))
977    }
978
979    pub fn parse_host(
980        mut input: Input<'_>,
981        scheme_type: SchemeType,
982    ) -> ParseResult<(Host<String>, Input<'_>)> {
983        if scheme_type.is_file() {
984            return Parser::get_file_host(input);
985        }
986        // Undo the Input abstraction here to avoid allocating in the common case
987        // where the host part of the input does not contain any tab or newline
988        let input_str = input.chars.as_str();
989        let mut inside_square_brackets = false;
990        let mut has_ignored_chars = false;
991        let mut non_ignored_chars = 0;
992        let mut bytes = 0;
993        for c in input_str.chars() {
994            match c {
995                ':' if !inside_square_brackets => break,
996                '\\' if scheme_type.is_special() => break,
997                '/' | '?' | '#' => break,
998                '\t' | '\n' | '\r' => {
999                    has_ignored_chars = true;
1000                }
1001                '[' => {
1002                    inside_square_brackets = true;
1003                    non_ignored_chars += 1
1004                }
1005                ']' => {
1006                    inside_square_brackets = false;
1007                    non_ignored_chars += 1
1008                }
1009                _ => non_ignored_chars += 1,
1010            }
1011            bytes += c.len_utf8();
1012        }
1013        let replaced: String;
1014        let host_str;
1015        {
1016            let host_input = input.by_ref().take(non_ignored_chars);
1017            if has_ignored_chars {
1018                replaced = host_input.collect();
1019                host_str = &*replaced
1020            } else {
1021                for _ in host_input {}
1022                host_str = &input_str[..bytes]
1023            }
1024        }
1025        if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
1026            return Err(ParseError::EmptyHost);
1027        }
1028        if !scheme_type.is_special() {
1029            let host = Host::parse_opaque(host_str)?;
1030            return Ok((host, input));
1031        }
1032        let host = Host::parse(host_str)?;
1033        Ok((host, input))
1034    }
1035
1036    fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1037        let (_, host_str, remaining) = Parser::file_host(input)?;
1038        let host = match Host::parse(&host_str)? {
1039            Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1040            host => host,
1041        };
1042        Ok((host, remaining))
1043    }
1044
1045    fn parse_file_host<'i>(
1046        &mut self,
1047        input: Input<'i>,
1048    ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1049        let has_host;
1050        let (_, host_str, remaining) = Parser::file_host(input)?;
1051        let host = if host_str.is_empty() {
1052            has_host = false;
1053            HostInternal::None
1054        } else {
1055            match Host::parse(&host_str)? {
1056                Host::Domain(ref d) if d == "localhost" => {
1057                    has_host = false;
1058                    HostInternal::None
1059                }
1060                host => {
1061                    write!(&mut self.serialization, "{}", host).unwrap();
1062                    has_host = true;
1063                    host.into()
1064                }
1065            }
1066        };
1067        Ok((has_host, host, remaining))
1068    }
1069
1070    pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1071        // Undo the Input abstraction here to avoid allocating in the common case
1072        // where the host part of the input does not contain any tab or newline
1073        let input_str = input.chars.as_str();
1074        let mut has_ignored_chars = false;
1075        let mut non_ignored_chars = 0;
1076        let mut bytes = 0;
1077        for c in input_str.chars() {
1078            match c {
1079                '/' | '\\' | '?' | '#' => break,
1080                '\t' | '\n' | '\r' => has_ignored_chars = true,
1081                _ => non_ignored_chars += 1,
1082            }
1083            bytes += c.len_utf8();
1084        }
1085        let replaced: String;
1086        let host_str;
1087        let mut remaining = input.clone();
1088        {
1089            let host_input = remaining.by_ref().take(non_ignored_chars);
1090            if has_ignored_chars {
1091                replaced = host_input.collect();
1092                host_str = &*replaced
1093            } else {
1094                for _ in host_input {}
1095                host_str = &input_str[..bytes]
1096            }
1097        }
1098        if is_windows_drive_letter(host_str) {
1099            return Ok((false, "".to_string(), input));
1100        }
1101        Ok((true, host_str.to_string(), remaining))
1102    }
1103
1104    pub fn parse_port<P>(
1105        mut input: Input<'_>,
1106        default_port: P,
1107        context: Context,
1108    ) -> ParseResult<(Option<u16>, Input<'_>)>
1109    where
1110        P: Fn() -> Option<u16>,
1111    {
1112        let mut port: u32 = 0;
1113        let mut has_any_digit = false;
1114        while let (Some(c), remaining) = input.split_first() {
1115            if let Some(digit) = c.to_digit(10) {
1116                port = port * 10 + digit;
1117                if port > u16::MAX as u32 {
1118                    return Err(ParseError::InvalidPort);
1119                }
1120                has_any_digit = true;
1121            } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1122                return Err(ParseError::InvalidPort);
1123            } else {
1124                break;
1125            }
1126            input = remaining;
1127        }
1128
1129        if !has_any_digit && context == Context::Setter && !input.is_empty() {
1130            return Err(ParseError::InvalidPort);
1131        }
1132
1133        let mut opt_port = Some(port as u16);
1134        if !has_any_digit || opt_port == default_port() {
1135            opt_port = None;
1136        }
1137        Ok((opt_port, input))
1138    }
1139
1140    pub fn parse_path_start<'i>(
1141        &mut self,
1142        scheme_type: SchemeType,
1143        has_host: &mut bool,
1144        input: Input<'i>,
1145    ) -> Input<'i> {
1146        let path_start = self.serialization.len();
1147        let (maybe_c, remaining) = input.split_first();
1148        // If url is special, then:
1149        if scheme_type.is_special() {
1150            if maybe_c == Some('\\') {
1151                // If c is U+005C (\), validation error.
1152                self.log_violation(SyntaxViolation::Backslash);
1153            }
1154            // A special URL always has a non-empty path.
1155            if !self.serialization.ends_with('/') {
1156                self.serialization.push('/');
1157                // We have already made sure the forward slash is present.
1158                if maybe_c == Some('/') || maybe_c == Some('\\') {
1159                    return self.parse_path(scheme_type, has_host, path_start, remaining);
1160                }
1161            }
1162            return self.parse_path(scheme_type, has_host, path_start, input);
1163        } else if maybe_c == Some('?') || maybe_c == Some('#') {
1164            // Otherwise, if state override is not given and c is U+003F (?),
1165            // set url’s query to the empty string and state to query state.
1166            // Otherwise, if state override is not given and c is U+0023 (#),
1167            // set url’s fragment to the empty string and state to fragment state.
1168            // The query and path states will be handled by the caller.
1169            return input;
1170        }
1171
1172        if maybe_c.is_some() && maybe_c != Some('/') {
1173            self.serialization.push('/');
1174        }
1175        // Otherwise, if c is not the EOF code point:
1176        self.parse_path(scheme_type, has_host, path_start, input)
1177    }
1178
1179    pub fn parse_path<'i>(
1180        &mut self,
1181        scheme_type: SchemeType,
1182        has_host: &mut bool,
1183        path_start: usize,
1184        mut input: Input<'i>,
1185    ) -> Input<'i> {
1186        // Relative path state
1187        loop {
1188            let mut segment_start = self.serialization.len();
1189            let mut ends_with_slash = false;
1190            loop {
1191                let input_before_c = input.clone();
1192                let (c, utf8_c) = if let Some(x) = input.next_utf8() {
1193                    x
1194                } else {
1195                    break;
1196                };
1197                match c {
1198                    '/' if self.context != Context::PathSegmentSetter => {
1199                        self.serialization.push(c);
1200                        ends_with_slash = true;
1201                        break;
1202                    }
1203                    '\\' if self.context != Context::PathSegmentSetter
1204                        && scheme_type.is_special() =>
1205                    {
1206                        self.log_violation(SyntaxViolation::Backslash);
1207                        self.serialization.push('/');
1208                        ends_with_slash = true;
1209                        break;
1210                    }
1211                    '?' | '#' if self.context == Context::UrlParser => {
1212                        input = input_before_c;
1213                        break;
1214                    }
1215                    _ => {
1216                        self.check_url_code_point(c, &input);
1217                        if scheme_type.is_file()
1218                            && self.serialization.len() > path_start
1219                            && is_normalized_windows_drive_letter(
1220                                &self.serialization[path_start + 1..],
1221                            )
1222                        {
1223                            self.serialization.push('/');
1224                            segment_start += 1;
1225                        }
1226                        if self.context == Context::PathSegmentSetter {
1227                            if scheme_type.is_special() {
1228                                self.serialization
1229                                    .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
1230                            } else {
1231                                self.serialization
1232                                    .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
1233                            }
1234                        } else {
1235                            self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
1236                        }
1237                    }
1238                }
1239            }
1240            let segment_before_slash = if ends_with_slash {
1241                &self.serialization[segment_start..self.serialization.len() - 1]
1242            } else {
1243                &self.serialization[segment_start..self.serialization.len()]
1244            };
1245            match segment_before_slash {
1246                // If buffer is a double-dot path segment, shorten url’s path,
1247                ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
1248                | ".%2E" => {
1249                    debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
1250                    self.serialization.truncate(segment_start);
1251                    if self.serialization.ends_with('/')
1252                        && Parser::last_slash_can_be_removed(&self.serialization, path_start)
1253                    {
1254                        self.serialization.pop();
1255                    }
1256                    self.shorten_path(scheme_type, path_start);
1257
1258                    // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
1259                    if ends_with_slash && !self.serialization.ends_with('/') {
1260                        self.serialization.push('/');
1261                    }
1262                }
1263                // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
1264                // nor url is special and c is U+005C (\), append the empty string to url’s path.
1265                "." | "%2e" | "%2E" => {
1266                    self.serialization.truncate(segment_start);
1267                    if !self.serialization.ends_with('/') {
1268                        self.serialization.push('/');
1269                    }
1270                }
1271                _ => {
1272                    // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
1273                    if scheme_type.is_file()
1274                        && segment_start == path_start + 1
1275                        && is_windows_drive_letter(segment_before_slash)
1276                    {
1277                        // Replace the second code point in buffer with U+003A (:).
1278                        if let Some(c) = segment_before_slash.chars().next() {
1279                            self.serialization.truncate(segment_start);
1280                            self.serialization.push(c);
1281                            self.serialization.push(':');
1282                            if ends_with_slash {
1283                                self.serialization.push('/');
1284                            }
1285                        }
1286                        // If url’s host is neither the empty string nor null,
1287                        // validation error, set url’s host to the empty string.
1288                        if *has_host {
1289                            self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
1290                            *has_host = false; // FIXME account for this in callers
1291                        }
1292                    }
1293                }
1294            }
1295            if !ends_with_slash {
1296                break;
1297            }
1298        }
1299        if scheme_type.is_file() {
1300            // while url’s path’s size is greater than 1
1301            // and url’s path[0] is the empty string,
1302            // validation error, remove the first item from url’s path.
1303            //FIXME: log violation
1304            let path = self.serialization.split_off(path_start);
1305            self.serialization.push('/');
1306            self.serialization.push_str(path.trim_start_matches('/'));
1307        }
1308
1309        input
1310    }
1311
1312    fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1313        let url_before_segment = &serialization[..serialization.len() - 1];
1314        if let Some(segment_before_start) = url_before_segment.rfind('/') {
1315            // Do not remove the root slash
1316            segment_before_start >= path_start
1317                // Or a windows drive letter slash
1318                && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1319        } else {
1320            false
1321        }
1322    }
1323
1324    /// https://url.spec.whatwg.org/#shorten-a-urls-path
1325    fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1326        // If path is empty, then return.
1327        if self.serialization.len() == path_start {
1328            return;
1329        }
1330        // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1331        if scheme_type.is_file()
1332            && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1333        {
1334            return;
1335        }
1336        // Remove path’s last item.
1337        self.pop_path(scheme_type, path_start);
1338    }
1339
1340    /// https://url.spec.whatwg.org/#pop-a-urls-path
1341    fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1342        if self.serialization.len() > path_start {
1343            let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1344            // + 1 since rfind returns the position before the slash.
1345            let segment_start = path_start + slash_position + 1;
1346            // Don’t pop a Windows drive letter
1347            if !(scheme_type.is_file()
1348                && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1349            {
1350                self.serialization.truncate(segment_start);
1351            }
1352        }
1353    }
1354
1355    pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1356        loop {
1357            let input_before_c = input.clone();
1358            match input.next_utf8() {
1359                Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1360                    return input_before_c
1361                }
1362                Some((c, utf8_c)) => {
1363                    self.check_url_code_point(c, &input);
1364                    self.serialization
1365                        .extend(utf8_percent_encode(utf8_c, CONTROLS));
1366                }
1367                None => return input,
1368            }
1369        }
1370    }
1371
1372    #[allow(clippy::too_many_arguments)]
1373    fn with_query_and_fragment(
1374        mut self,
1375        scheme_type: SchemeType,
1376        scheme_end: u32,
1377        username_end: u32,
1378        host_start: u32,
1379        host_end: u32,
1380        host: HostInternal,
1381        port: Option<u16>,
1382        mut path_start: u32,
1383        remaining: Input<'_>,
1384    ) -> ParseResult<Url> {
1385        // Special case for anarchist URL's with a leading empty path segment
1386        // This prevents web+demo:/.//not-a-host/ or web+demo:/path/..//not-a-host/,
1387        // when parsed and then serialized, from ending up as web+demo://not-a-host/
1388        // (they end up as web+demo:/.//not-a-host/).
1389        //
1390        // If url’s host is null, url does not have an opaque path,
1391        // url’s path’s size is greater than 1, and url’s path[0] is the empty string,
1392        // then append U+002F (/) followed by U+002E (.) to output.
1393        let scheme_end_as_usize = scheme_end as usize;
1394        let path_start_as_usize = path_start as usize;
1395        if path_start_as_usize == scheme_end_as_usize + 1 {
1396            // Anarchist URL
1397            if self.serialization[path_start_as_usize..].starts_with("//") {
1398                // Case 1: The base URL did not have an empty path segment, but the resulting one does
1399                // Insert the "/." prefix
1400                self.serialization.insert_str(path_start_as_usize, "/.");
1401                path_start += 2;
1402            }
1403            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1404        } else if path_start_as_usize == scheme_end_as_usize + 3
1405            && &self.serialization[scheme_end_as_usize..path_start_as_usize] == ":/."
1406        {
1407            // Anarchist URL with leading empty path segment
1408            // The base URL has a "/." between the host and the path
1409            assert_eq!(self.serialization.as_bytes()[path_start_as_usize], b'/');
1410            if self
1411                .serialization
1412                .as_bytes()
1413                .get(path_start_as_usize + 1)
1414                .copied()
1415                != Some(b'/')
1416            {
1417                // Case 2: The base URL had an empty path segment, but the resulting one does not
1418                // Remove the "/." prefix
1419                self.serialization
1420                    .replace_range(scheme_end_as_usize..path_start_as_usize, ":");
1421                path_start -= 2;
1422            }
1423            assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1424        }
1425
1426        let (query_start, fragment_start) =
1427            self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1428        Ok(Url {
1429            serialization: self.serialization,
1430            scheme_end,
1431            username_end,
1432            host_start,
1433            host_end,
1434            host,
1435            port,
1436            path_start,
1437            query_start,
1438            fragment_start,
1439        })
1440    }
1441
1442    /// Return (query_start, fragment_start)
1443    fn parse_query_and_fragment(
1444        &mut self,
1445        scheme_type: SchemeType,
1446        scheme_end: u32,
1447        mut input: Input<'_>,
1448    ) -> ParseResult<(Option<u32>, Option<u32>)> {
1449        let mut query_start = None;
1450        match input.next() {
1451            Some('#') => {}
1452            Some('?') => {
1453                query_start = Some(to_u32(self.serialization.len())?);
1454                self.serialization.push('?');
1455                let remaining = self.parse_query(scheme_type, scheme_end, input);
1456                if let Some(remaining) = remaining {
1457                    input = remaining
1458                } else {
1459                    return Ok((query_start, None));
1460                }
1461            }
1462            None => return Ok((None, None)),
1463            _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1464        }
1465
1466        let fragment_start = to_u32(self.serialization.len())?;
1467        self.serialization.push('#');
1468        self.parse_fragment(input);
1469        Ok((query_start, Some(fragment_start)))
1470    }
1471
1472    pub fn parse_query<'i>(
1473        &mut self,
1474        scheme_type: SchemeType,
1475        scheme_end: u32,
1476        mut input: Input<'i>,
1477    ) -> Option<Input<'i>> {
1478        let len = input.chars.as_str().len();
1479        let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1480        let mut remaining = None;
1481        while let Some(c) = input.next() {
1482            if c == '#' && self.context == Context::UrlParser {
1483                remaining = Some(input);
1484                break;
1485            } else {
1486                self.check_url_code_point(c, &input);
1487                query.push(c);
1488            }
1489        }
1490
1491        let encoding = match &self.serialization[..scheme_end as usize] {
1492            "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1493            _ => None,
1494        };
1495        let query_bytes = if let Some(o) = encoding {
1496            o(&query)
1497        } else {
1498            query.as_bytes().into()
1499        };
1500        let set = if scheme_type.is_special() {
1501            SPECIAL_QUERY
1502        } else {
1503            QUERY
1504        };
1505        self.serialization.extend(percent_encode(&query_bytes, set));
1506        remaining
1507    }
1508
1509    fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1510        let before_fragment = match base_url.fragment_start {
1511            Some(i) => base_url.slice(..i),
1512            None => &*base_url.serialization,
1513        };
1514        debug_assert!(self.serialization.is_empty());
1515        self.serialization
1516            .reserve(before_fragment.len() + input.chars.as_str().len());
1517        self.serialization.push_str(before_fragment);
1518        self.serialization.push('#');
1519        let next = input.next();
1520        debug_assert!(next == Some('#'));
1521        self.parse_fragment(input);
1522        Ok(Url {
1523            serialization: self.serialization,
1524            fragment_start: Some(to_u32(before_fragment.len())?),
1525            ..*base_url
1526        })
1527    }
1528
1529    pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1530        while let Some((c, utf8_c)) = input.next_utf8() {
1531            if c == '\0' {
1532                self.log_violation(SyntaxViolation::NullInFragment)
1533            } else {
1534                self.check_url_code_point(c, &input);
1535            }
1536            self.serialization
1537                .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1538        }
1539    }
1540
1541    fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1542        if let Some(vfn) = self.violation_fn {
1543            if c == '%' {
1544                let mut input = input.clone();
1545                if !matches!((input.next(), input.next()), (Some(a), Some(b))
1546                             if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
1547                {
1548                    vfn(SyntaxViolation::PercentDecode)
1549                }
1550            } else if !is_url_code_point(c) {
1551                vfn(SyntaxViolation::NonUrlCodePoint)
1552            }
1553        }
1554    }
1555}
1556
// URL code points (https://url.spec.whatwg.org/#url-code-points) are the ASCII
// alphanumerics, the punctuation listed in the function below, and every code
// point from U+00A0 to U+10FFFD except surrogates and noncharacters.
//
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';'
        | '=' | '?' | '@' | '_' | '~' => true,
        // Noncharacter block inside the Basic Multilingual Plane.
        '\u{FDD0}'..='\u{FDEF}' => false,
        // From U+00A0 upwards everything is allowed except the last two code
        // points of each plane (U+__FFFE and U+__FFFF). Deriving this from the
        // rule instead of listing each plane by hand covers U+E0000..=U+E0FFF,
        // which a hand-written per-plane list previously left out. Surrogates
        // need no arm because a `char` can never hold one.
        '\u{A0}'..='\u{10FFFF}' => (u32::from(c) & 0xFFFE) != 0xFFFE,
        // Remaining ASCII (controls, space, excluded punctuation) and
        // U+007F to U+009F.
        _ => false,
    }
}
1582
/// Returns `true` for a C0 control or space: any code point from U+0000
/// through U+0020 (space), inclusive.
///
/// See <https://url.spec.whatwg.org/#c0-controls-and-space>.
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..=' ')
}
1588
/// Returns `true` for an ASCII tab or newline: U+0009 TAB, U+000A LF or
/// U+000D CR.
///
/// See <https://infra.spec.whatwg.org/#ascii-tab-or-newline>.
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1594
/// Returns `true` for an ASCII alpha: a letter in `a`-`z` or `A`-`Z`.
///
/// See <https://url.spec.whatwg.org/#ascii-alpha>.
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z')
}
1600
1601#[inline]
1602pub fn to_u32(i: usize) -> ParseResult<u32> {
1603    if i <= u32::MAX as usize {
1604        Ok(i as u32)
1605    } else {
1606        Err(ParseError::Overflow)
1607    }
1608}
1609
1610fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1611    is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1612}
1613
1614/// Whether the scheme is file:, the path has a single segment, and that segment
1615/// is a Windows drive letter
1616#[inline]
1617pub fn is_windows_drive_letter(segment: &str) -> bool {
1618    segment.len() == 2 && starts_with_windows_drive_letter(segment)
1619}
1620
1621/// Whether path starts with a root slash
1622/// and a windows drive letter eg: "/c:" or "/a:/"
1623fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1624    if let Some(c) = s.as_bytes().first() {
1625        matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1626    } else {
1627        false
1628    }
1629}
1630
/// Whether `s` starts with a Windows drive letter: an ASCII letter, then `:`
/// or `|`, then either the end of the string or one of `/`, `\`, `?`, `#`.
fn starts_with_windows_drive_letter(s: &str) -> bool {
    let bytes = s.as_bytes();
    let (letter, separator) = match (bytes.first(), bytes.get(1)) {
        (Some(&letter), Some(&separator)) => (letter, separator),
        // Fewer than two bytes can never hold a drive letter.
        _ => return false,
    };
    // Whatever follows the two-byte prefix must be nothing or a delimiter.
    let terminated = match bytes.get(2) {
        None => true,
        Some(next) => matches!(next, b'/' | b'\\' | b'?' | b'#'),
    };
    letter.is_ascii_alphabetic() && matches!(separator, b':' | b'|') && terminated
}
1637
1638/// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
1639fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1640    let mut input = input.clone();
1641    match (input.next(), input.next(), input.next()) {
1642        // its first two code points are a Windows drive letter
1643        // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1644        (Some(a), Some(b), Some(c))
1645            if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1646        {
1647            true
1648        }
1649        // its first two code points are a Windows drive letter
1650        // its length is 2
1651        (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1652        _ => false,
1653    }
1654}