idna/uts46.rs
1// Copyright The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! This module provides the lower-level API for UTS 46.
10//!
11//! [`Uts46::process`] is the core that the other convenience
12//! methods build on.
13//!
14//! UTS 46 flags map to this API as follows:
15//!
16//! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
17//! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
18//! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_.
19//! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`].
//! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, and Chrome set this flag to _false_.
21//! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents.
//! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browsers clearly behave as if this was _false_.)
23
24use crate::punycode::Decoder;
25use crate::punycode::InternalCaller;
26use alloc::borrow::Cow;
27use alloc::string::String;
28use core::fmt::Write;
29use idna_adapter::*;
30use smallvec::SmallVec;
31use utf8_iter::Utf8CharsEx;
32
/// Upper bound on the length of input accepted for Punycode decoding.
///
/// ICU4C-compatible constraint.
/// <https://unicode-org.atlassian.net/browse/ICU-13727>
///
/// NOTE(review): the unit of measurement is decided at the use site
/// (presumably bytes of the ASCII label) - confirm there.
const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2000;

/// Upper bound on the length of input accepted for Punycode encoding.
///
/// ICU4C-compatible constraint. (Note: ICU4C measures
/// UTF-16 and we measure UTF-32. This means that we
/// allow longer non-BMP inputs. For this implementation,
/// the denial-of-service scaling does not depend on BMP vs.
/// non-BMP: only the scalar values matter.)
///
/// <https://unicode-org.atlassian.net/browse/ICU-13727>
const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1000;
45
/// For keeping track of what kind of numerals have been
/// seen in an RTL label.
///
/// NOTE(review): the per-variant notes below are inferred from the variant
/// names only; confirm them against the code that drives this state.
#[derive(Debug, PartialEq, Eq)]
enum RtlNumeralState {
    /// Presumably: no numeral has been seen in the label so far.
    Undecided,
    /// Presumably: a European numeral has been seen.
    European,
    /// Presumably: an Arabic numeral has been seen.
    Arabic,
}
54
/// Builds the 128-bit set in which bit `n` is set exactly when the ASCII
/// code `n` is an upper-case letter (`A` through `Z`).
const fn upper_case_mask() -> u128 {
    let mut mask = 0u128;
    let mut letter = b'A';
    // `for` is not usable in a `const fn`, hence the explicit counter.
    while letter <= b'Z' {
        mask |= 1u128 << letter;
        letter += 1;
    }
    mask
}

/// Bit set for upper-case ASCII.
const UPPER_CASE_MASK: u128 = upper_case_mask();
70
/// Builds the 128-bit set of glyphless ASCII: every code from U+0000 up to
/// and including U+0020 SPACE, plus U+007F DELETE.
const fn glyphless_mask() -> u128 {
    // Seed the set with DELETE, then add the codes up to and including SPACE.
    let mut mask = 1u128 << 0x7F;
    let mut code = 0u8;
    while code <= b' ' {
        mask |= 1u128 << code;
        code += 1;
    }
    mask
}

/// Bit set for glyphless ASCII.
const GLYPHLESS_MASK: u128 = glyphless_mask();
86
/// The mask for the ASCII dot: a `u128` bit set with only bit `b'.'` set,
/// using the same bit-per-ASCII-code layout as the other masks in this file.
const DOT_MASK: u128 = 1 << b'.';
89
/// Computes the ASCII deny list for STD3 ASCII rules.
///
/// The returned set has bit `n` set for every ASCII code `n` that is NOT a
/// lower-case letter, a digit, the hyphen, or the dot. Upper-case letters
/// are therefore part of the set.
const fn ldh_mask() -> u128 {
    let mut denied = 0u128;
    let mut code = 0u8;
    while code < 128 {
        let permitted = matches!(code, b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.');
        if !permitted {
            denied |= 1u128 << code;
        }
        code += 1;
    }
    denied
}
102
/// The bytes `X`, `N`, `-`, `-` packed into a `u32` with `X` in the least
/// significant byte. `has_punycode_prefix` packs the first four bytes of a
/// label the same way (first byte least significant) before comparing.
const PUNYCODE_PREFIX: u32 =
    ((b'-' as u32) << 24) | ((b'-' as u32) << 16) | ((b'N' as u32) << 8) | b'X' as u32;

/// Mask applied before comparing against `PUNYCODE_PREFIX`: `0xDF` clears
/// bit `0x20` (the ASCII case bit) in the two low bytes so that `x`/`X` and
/// `n`/`N` compare equal, while the two hyphen bytes are compared exactly.
const PUNYCODE_PREFIX_MASK: u32 = (0xFF << 24) | (0xFF << 16) | (0xDF << 8) | 0xDF;
107
/// Writes `label` to `sink` in Punycode form: the literal `xn--` followed by
/// whatever `crate::punycode::encode_into` emits for the label's scalar values.
///
/// # Errors
///
/// A failure of `sink.write_str` is converted to `ProcessingError` via `?`
/// (i.e. `ProcessingError::SinkError`). An error from `encode_into` is also
/// converted via `?`.
///
/// NOTE(review): the return type of `encode_into` is not visible here; if it
/// is `PunycodeEncodeError`, the conversion used by `?` is the one that panics
/// with `unreachable!` - confirm.
fn write_punycode_label<W: Write + ?Sized>(
    label: &[char],
    sink: &mut W,
) -> Result<(), ProcessingError> {
    sink.write_str("xn--")?;
    crate::punycode::encode_into::<_, _, InternalCaller>(label.iter().copied(), sink)?;
    Ok(())
}
116
117#[inline(always)]
118fn has_punycode_prefix(slice: &[u8]) -> bool {
119 if slice.len() < 4 {
120 return false;
121 }
122 // Sadly, the optimizer doesn't figure out that more idiomatic code
123 // should compile to masking on 32-bit value.
124 let a = slice[0];
125 let b = slice[1];
126 let c = slice[2];
127 let d = slice[3];
128 let u = (u32::from(d) << 24) | (u32::from(c) << 16) | (u32::from(b) << 8) | u32::from(a);
129 (u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX
130}
131
/// Returns `true` iff `start <= u <= end`, evaluated with a single
/// comparison. Callers pass `start <= end`.
#[inline(always)]
fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    // Values below `start` wrap around to something larger than `width`.
    let offset = u.wrapping_sub(start);
    offset <= width
}
136
/// Returns `true` iff `start <= c <= end` (compared as scalar values),
/// evaluated with a single comparison. Callers pass `start <= end`.
#[inline(always)]
fn in_inclusive_range_char(c: char, start: char, end: char) -> bool {
    let low = u32::from(start);
    let width = u32::from(end) - low;
    // Scalar values below `start` wrap around to something larger than `width`.
    u32::from(c).wrapping_sub(low) <= width
}
141
142#[inline(always)]
143fn is_passthrough_ascii_label(label: &[u8]) -> bool {
144 // XXX if we aren't performing _CheckHyphens_, this could
145 // check for "xn--" and pass through YouTube CDN node names.
146 if label.len() >= 4 && label[2] == b'-' && label[3] == b'-' {
147 return false;
148 }
149 if let Some((&first, tail)) = label.split_first() {
150 // We need to check the first and last character
151 // more strictly in case this turns out to be a
152 // label in a bidi domain name. This has the side
153 // effect that this function only accepts labels
154 // that also conform to the STD3 rules.
155 //
156 // XXX: If we are in the fail-fast mode (i.e. we don't need
157 // to be able to overwrite anything with U+FFFD), we could
158 // merely record that we've seen a digit here and error out
159 // if we later discover that the domain name is a bidi
160 // domain name.
161 if !in_inclusive_range8(first, b'a', b'z') {
162 return false;
163 }
164 for &b in tail {
165 // If we used LDH_MASK, we'd have to check
166 // the bytes for the ASCII range anyhow.
167 if in_inclusive_range8(b, b'a', b'z') {
168 continue;
169 }
170 if in_inclusive_range8(b, b'0', b'9') {
171 continue;
172 }
173 if b == b'-' {
174 continue;
175 }
176 return false;
177 }
178 label.last() != Some(&b'-')
179 } else {
180 // empty
181 true
182 }
183}
184
/// Splits `label` into an ASCII prefix and the remainder.
///
/// If `label` is entirely ASCII, the whole label is the prefix. Otherwise
/// the prefix ends one byte before the ASCII byte that immediately precedes
/// the first non-ASCII byte, so that this ASCII byte stays in the suffix.
#[inline(always)]
fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) {
    let empty: &[u8] = &[];
    match label.iter().position(|b| !b.is_ascii()) {
        // All ASCII
        None => (label, empty),
        // First is non-ASCII
        Some(0) => (empty, label),
        // Leave one ASCII character in the suffix
        // in case it's a letter that a combining
        // character combines with.
        Some(first_non_ascii) => label.split_at(first_non_ascii - 1),
    }
}
203
// Input known to be lower-case, but may contain non-ASCII.
//
// Returns U+FFFD if `c` is an ASCII character whose bit is set in
// `deny_list`; returns `c` unchanged otherwise. Non-ASCII characters
// (scalar value 128 and above) are never affected.
#[inline(always)]
fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char {
    // `checked_shl` yields `None` for shift amounts of 128 and above,
    // which is exactly the non-ASCII case.
    match 1u128.checked_shl(u32::from(c)) {
        Some(bit) if (deny_list & bit) != 0 => '\u{FFFD}',
        _ => c,
    }
}
217
218// Input known to be ASCII, but may contain upper case ASCII.
219#[inline(always)]
220fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char {
221 if (deny_list & (1u128 << b)) == 0 {
222 return char::from(b);
223 }
224 if in_inclusive_range8(b, b'A', b'Z') {
225 return char::from(b + 0x20);
226 }
227 '\u{FFFD}'
228}
229
/// Returns `true` iff every `char` in `label` is ASCII (vacuously `true`
/// for the empty label).
#[inline(always)]
fn is_ascii(label: &[char]) -> bool {
    label.iter().all(char::is_ascii)
}
239
/// Result of [`classify_for_punycode`].
#[derive(PartialEq, Eq, Copy, Clone)]
enum PunycodeClassification {
    /// Every character of the label is ASCII.
    Ascii,
    /// The label contains non-ASCII but no U+FFFD.
    Unicode,
    /// The label contains U+FFFD.
    Error,
}

/// Classifies a label as all-ASCII, non-ASCII without U+FFFD, or
/// containing U+FFFD.
#[inline(always)]
fn classify_for_punycode(label: &[char]) -> PunycodeClassification {
    match label.iter().position(|c| !c.is_ascii()) {
        None => PunycodeClassification::Ascii,
        // U+FFFD is itself non-ASCII, so it cannot occur before the
        // first non-ASCII position; only the rest needs to be searched.
        Some(first_non_ascii) => {
            if label[first_non_ascii..].contains(&'\u{FFFD}') {
                PunycodeClassification::Error
            } else {
                PunycodeClassification::Unicode
            }
        }
    }
}
268
269/// The ASCII deny list to be applied.
270#[derive(PartialEq, Eq, Copy, Clone)]
271#[repr(transparent)]
272pub struct AsciiDenyList {
273 bits: u128,
274}
275
276impl AsciiDenyList {
277 /// Computes (preferably at compile time) an ASCII deny list.
278 ///
279 /// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below
280 /// as well as U+007F DELETE for convenience without having to list
281 /// these characters in the `deny_list` string.
282 ///
283 /// `deny_list` is the list of ASCII characters to deny. This
284 /// list must not contain any of:
285 /// * Letters
286 /// * Digits
287 /// * Hyphen
288 /// * Dot (period / full-stop)
289 /// * Non-ASCII
290 ///
291 /// # Panics
292 ///
293 /// If the deny list contains characters listed as prohibited above.
294 pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self {
295 let mut bits = UPPER_CASE_MASK;
296 if deny_glyphless {
297 bits |= GLYPHLESS_MASK;
298 }
299 let mut i = 0;
300 let bytes = deny_list.as_bytes();
301 while i < bytes.len() {
302 let b = bytes[i];
303 assert!(b < 0x80, "ASCII deny list must be ASCII.");
304 // assert_ne not yet available in const context.
305 assert!(b != b'.', "ASCII deny list must not contain the dot.");
306 assert!(b != b'-', "ASCII deny list must not contain the hyphen.");
307 assert!(
308 !((b >= b'0') && (b <= b'9')),
309 "ASCII deny list must not contain digits."
310 );
311 assert!(
312 !((b >= b'a') && (b <= b'z')),
313 "ASCII deny list must not contain letters."
314 );
315 assert!(
316 !((b >= b'A') && (b <= b'Z')),
317 "ASCII deny list must not contain letters."
318 );
319 bits |= 1u128 << b;
320 i += 1;
321 }
322 AsciiDenyList { bits }
323 }
324
325 /// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_.
326 ///
327 /// Equivalent to `AsciiDenyList::new(false, "")`.
328 ///
329 /// Note: Not denying the space and control characters can result in
330 /// strange behavior. Without a deny list provided to the UTS 46
331 /// operation, the caller is expected perform filtering afterwards,
332 /// but it's more efficient to use `AsciiDenyList` than post-processing,
333 /// because the internals of this crate can optimize away checks in
334 /// certain cases.
335 pub const EMPTY: AsciiDenyList = AsciiDenyList::new(false, "");
336
337 /// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_.
338 ///
339 /// Note that this deny list rejects the underscore, which occurs in
340 /// pseudo-hosts used by various TXT record-based protocols, and also
341 /// characters that may occurs in non-DNS naming, such as NetBIOS.
342 pub const STD3: AsciiDenyList = AsciiDenyList { bits: ldh_mask() };
343
344 /// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard.
345 ///
346 /// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^|")`.
347 ///
348 /// Note that this deny list rejects IPv6 addresses, so (as in URL
349 /// parsing) you need to check for IPv6 addresses first and not
350 /// put them through UTS 46 processing.
351 pub const URL: AsciiDenyList = AsciiDenyList::new(true, "%#/:<>?@[\\]^|");
352}
353
/// The _CheckHyphens_ mode.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive] // non_exhaustive so that further modes can be added; the middle mode that prohibits only first and last position is `CheckFirstLast`
pub enum Hyphens {
    /// _CheckHyphens=false_: Do not place positional restrictions on hyphens.
    ///
    /// This mode is used by the WHATWG URL Standard for normal User Agent processing
    /// (i.e. not conformance checking).
    Allow,

    /// Prohibit hyphens in the first and last position in the label but allow in
    /// the third and fourth position.
    ///
    /// This mode sits between _CheckHyphens=false_ and _CheckHyphens=true_.
    ///
    /// Note that this mode rejects real-world names, including some GitHub user pages.
    CheckFirstLast,

    /// _CheckHyphens=true_: Prohibit hyphens in the first, third, fourth,
    /// and last position in the label.
    ///
    /// Note that this mode rejects real-world names, including YouTube CDN nodes
    /// and some GitHub user pages.
    Check,
}
377
/// The UTS 46 _VerifyDNSLength_ flag.
///
/// [`Uts46::to_ascii`] applies this after processing: for every value other
/// than `Ignore` it runs [`verify_dns_length`] on the result, allowing the
/// trailing dot only for `VerifyAllowRootDot`.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum DnsLength {
    /// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS naming systems.)
    Ignore,
    /// _VerifyDNSLength=true_ with the exception that the trailing root label dot is
    /// allowed.
    VerifyAllowRootDot,
    /// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.)
    Verify,
}
390
/// Policy for customizing behavior in case of an error.
#[derive(PartialEq, Eq, Copy, Clone)]
#[non_exhaustive]
pub enum ErrorPolicy {
    /// Return as early as possible without producing output in case of error.
    FailFast,
    /// In case of error, mark errors with the REPLACEMENT CHARACTER. (The output
    /// containing REPLACEMENT CHARACTERs may be shown to the user to illustrate
    /// what was wrong but must not be used for naming in a network protocol.)
    MarkErrors,
}
402
/// The success outcome of [`Uts46::process`].
///
/// The two variants tell the caller where the output is: in the input
/// itself (`Passthrough`) or in the sink (`WroteToSink`).
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingSuccess {
    /// There were no errors. The caller must consider the input to be the output.
    ///
    /// This asserts that the input can be safely passed to [`core::str::from_utf8_unchecked`].
    ///
    /// (Distinct from `WroteToSink` in order to allow `Cow` behavior to be implemented on top of
    /// [`Uts46::process`].)
    Passthrough,

    /// There were no errors. The caller must consider what was written to the sink to be the output.
    ///
    /// (Distinct from `Passthrough` in order to allow `Cow` behavior to be implemented on top of
    /// [`Uts46::process`].)
    WroteToSink,
}
420
/// The failure outcome of [`Uts46::process`].
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub enum ProcessingError {
    /// The input was not valid under the chosen options.
    ///
    /// In case of `Operation::ToAscii`, there is no output. Otherwise, output was written to the
    /// sink and the output contains at least one U+FFFD REPLACEMENT CHARACTER to denote an error.
    ValidityError,

    /// The sink emitted [`core::fmt::Error`]. Whatever partial output reached the sink
    /// must not be used.
    SinkError,
}

/// Lets `?` turn a failed write to the sink into [`ProcessingError::SinkError`].
impl From<core::fmt::Error> for ProcessingError {
    fn from(_sink_failure: core::fmt::Error) -> Self {
        Self::SinkError
    }
}
440
/// Conversion from a Punycode encoding error; it never produces a value.
///
/// # Panics
///
/// Always panics via `unreachable!`: callers are expected to have limited
/// the input length (see `PUNYCODE_ENCODE_MAX_INPUT_LENGTH`) so that an
/// overflow during encoding cannot happen.
///
/// NOTE(review): the argument is ignored, so every kind of
/// `PunycodeEncodeError` ends up here. If that type can also report a sink
/// failure, such a failure would panic instead of surfacing as
/// `ProcessingError::SinkError` - confirm against `crate::punycode`.
impl From<crate::punycode::PunycodeEncodeError> for ProcessingError {
    fn from(_: crate::punycode::PunycodeEncodeError) -> Self {
        unreachable!(
            "Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH"
        );
    }
}
448
/// Per-label record about ASCII input; `Uts46::process` expects one entry
/// per label of the domain name.
#[derive(Debug, Clone, Copy)]
enum AlreadyAsciiLabel<'a> {
    /// The label's bytes, known to be ASCII. `Uts46::process` writes them
    /// out with any upper-case ASCII letters lower-cased (the slice is not
    /// required to actually contain upper case).
    MixedCaseAscii(&'a [u8]),
    /// NOTE(review): presumably the ASCII bytes of a label that was already
    /// in Punycode form in the input, possibly in mixed case - confirm at
    /// the use site.
    MixedCasePunycode(&'a [u8]),
    /// NOTE(review): presumably every label not covered by the other two
    /// variants - confirm at the use site.
    Other,
}
455
/// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation.
///
/// Returns `true` iff all of the following hold:
///
/// * `domain_name` does not end with a dot, or it does and
///   `allow_trailing_dot` is `true`;
/// * the name, not counting one trailing dot, is at most 253 bytes long;
/// * every label (the pieces between dots, after removing one trailing dot)
///   is between 1 and 63 bytes long.
///
/// If the second argument is `true`, the trailing root label dot is allowed;
/// if it is `false`, a name ending in a dot is rejected.
///
/// Consequently the empty string and a lone `"."` are always rejected,
/// because they contain an empty label.
///
/// # Panics
///
/// Panics in debug mode if the argument isn't ASCII.
pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool {
    let bytes = domain_name.as_bytes();
    debug_assert!(bytes.is_ascii());
    let without_root_dot = match bytes.strip_suffix(b".") {
        Some(_) if !allow_trailing_dot => return false,
        Some(stripped) => stripped,
        None => bytes,
    };
    without_root_dot.len() <= 253
        && without_root_dot
            .split(|b| *b == b'.')
            .all(|label| !label.is_empty() && label.len() <= 63)
}
487
/// An implementation of UTS #46.
pub struct Uts46 {
    // Adapter from the `idna_adapter` crate.
    // NOTE(review): presumably the source of the Unicode data and
    // normalization used during processing - confirm in `idna_adapter`.
    data: idna_adapter::Adapter,
}

/// Same as [`Uts46::new`]; only available with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
impl Default for Uts46 {
    fn default() -> Self {
        Self::new()
    }
}
499
500impl Uts46 {
    /// Constructor using data compiled into the binary.
    ///
    /// Only available when the `compiled_data` feature is enabled. Usable in
    /// `const` contexts.
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            data: idna_adapter::Adapter::new(),
        }
    }
508
509 // XXX Should there be an `icu_provider` feature for enabling
510 // a constructor for run-time data loading?
511
512 /// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation
513 /// from UTS #46 with the options indicated.
514 ///
515 /// # Arguments
516 ///
517 /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
518 /// this method and input that is not well-formed UTF-8 is treated as an error. If you
519 /// already have a `&str`, call `.as_bytes()` on it.)
520 /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
521 /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
522 /// processing is handled via this argument. Most callers are probably the best off
523 /// by using [`AsciiDenyList::URL`] here.
524 /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
525 /// off by using [`Hyphens::Allow`] here.
526 /// * `dns_length` - The UTS 46 _VerifyDNSLength_ flag.
527 pub fn to_ascii<'a>(
528 &self,
529 domain_name: &'a [u8],
530 ascii_deny_list: AsciiDenyList,
531 hyphens: Hyphens,
532 dns_length: DnsLength,
533 ) -> Result<Cow<'a, str>, crate::Errors> {
534 let mut s = String::new();
535 match self.process(
536 domain_name,
537 ascii_deny_list,
538 hyphens,
539 ErrorPolicy::FailFast,
540 |_, _, _| false,
541 &mut s,
542 None,
543 ) {
544 // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
545 Ok(ProcessingSuccess::Passthrough) => {
546 let cow = Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) });
547 if dns_length != DnsLength::Ignore
548 && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
549 {
550 Err(crate::Errors::default())
551 } else {
552 Ok(cow)
553 }
554 }
555 Ok(ProcessingSuccess::WroteToSink) => {
556 let cow: Cow<'_, str> = Cow::Owned(s);
557 if dns_length != DnsLength::Ignore
558 && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot)
559 {
560 Err(crate::Errors::default())
561 } else {
562 Ok(cow)
563 }
564 }
565 Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
566 Err(ProcessingError::SinkError) => unreachable!(),
567 }
568 }
569
    /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
    /// from UTS #46 according to the options given. When there
    /// are errors, there is still output, which may be rendered to the user, even though
    /// the output must not be used in networking protocols. Errors are denoted
    /// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the
    /// return tuple is `Err`, the first item of the return tuple is guaranteed to contain
    /// at least one U+FFFD.)
    ///
    /// This is [`Uts46::to_user_interface`] with a closure that always asks for
    /// Unicode output.
    ///
    /// Most applications probably shouldn't use this method and should be using
    /// [`Uts46::to_user_interface`] instead.
    ///
    /// # Arguments
    ///
    /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
    ///   this method and input that is not well-formed UTF-8 is treated as an error. If you
    ///   already have a `&str`, call `.as_bytes()` on it.)
    /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
    ///   _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
    ///   processing is handled via this argument. Most callers are probably the best off
    ///   by using [`AsciiDenyList::URL`] here.
    /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
    ///   off by using [`Hyphens::Allow`] here.
    pub fn to_unicode<'a>(
        &self,
        domain_name: &'a [u8],
        ascii_deny_list: AsciiDenyList,
        hyphens: Hyphens,
    ) -> (Cow<'a, str>, Result<(), crate::Errors>) {
        self.to_user_interface(domain_name, ascii_deny_list, hyphens, |_, _, _| true)
    }
600
601 /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation
602 /// from UTS #46 according to options given with some
603 /// error-free Unicode labels output according to
604 /// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by
605 /// application policy implemented via the `output_as_unicode` closure. The purpose
606 /// is to convert user-visible domains to the Unicode form in general but to render
607 /// potentially misleading labels as Punycode.
608 ///
609 /// This is an imperfect security mechanism, because [the Punycode form itself may be
610 /// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
611 /// However, since this mechanism is common practice, this API provides support for The
612 /// the mechanism.
613 ///
614 /// ASCII labels always pass through as ASCII and labels with errors always pass through
615 /// as Unicode. For non-erroneous labels that contain at least one non-ASCII character
616 /// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label,
617 /// the TLD (potentially empty), and a flag indicating whether the domain name as a whole
618 /// is a bidi domain name. If the return value is `true`, the label passes through as
619 /// Unicode. If the return value is `false`, the label is converted to Punycode.
620 ///
621 /// When there are errors, there is still output, which may be rendered user, even through
622 /// the output must not be used in networking protocols. Errors are denoted by
623 /// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item
624 /// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain
625 /// at least one U+FFFD.) Labels that contain errors are not converted to Punycode.
626 ///
627 /// # Arguments
628 ///
629 /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
630 /// this method and input that is not well-formed UTF-8 is treated as an error. If you
631 /// already have a `&str`, call `.as_bytes()` on it.)
632 /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
633 /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
634 /// processing is handled via this argument. Most callers are probably the best off
635 /// by using [`AsciiDenyList::URL`] here.
636 /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
637 /// off by using [`Hyphens::Allow`] here.
638 /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
639 /// (as opposed to Punycode). The first argument is the label for which a decision is
640 /// needed (always non-empty slice). The second argument is the TLD (potentially empty).
641 /// The third argument is `true` iff the domain name as a whole is a bidi domain name.
642 /// Only non-erroneous labels that contain at least one non-ASCII character are passed
643 /// to the closure as the first argument. The second and third argument values are
644 /// guaranteed to remain the same during a single call to `process`, and the closure
645 /// may cache computations derived from the second and third argument (hence the
646 /// `FnMut` type).
647 pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
648 &self,
649 domain_name: &'a [u8],
650 ascii_deny_list: AsciiDenyList,
651 hyphens: Hyphens,
652 output_as_unicode: OutputUnicode,
653 ) -> (Cow<'a, str>, Result<(), crate::Errors>) {
654 let mut s = String::new();
655 match self.process(
656 domain_name,
657 ascii_deny_list,
658 hyphens,
659 ErrorPolicy::MarkErrors,
660 output_as_unicode,
661 &mut s,
662 None,
663 ) {
664 // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII.
665 Ok(ProcessingSuccess::Passthrough) => (
666 Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }),
667 Ok(()),
668 ),
669 Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())),
670 Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())),
671 Err(ProcessingError::SinkError) => unreachable!(),
672 }
673 }
674
675 /// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and
676 /// [`Uts46::to_user_interface`] are built on to allow support for output types other
677 /// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language).
678 ///
679 /// # Arguments
680 ///
681 /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by
682 /// this method and input that is not well-formed UTF-8 is treated as an error. If you
683 /// already have a `&str`, call `.as_bytes()` on it.)
684 /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46
685 /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point
686 /// processing is handled via this argument. Most callers are probably the best off
687 /// by using [`AsciiDenyList::URL`] here.
688 /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best
689 /// off by using [`Hyphens::Allow`] here.
690 /// * `error_policy` - Whether to fail fast or to produce output that may be rendered
691 /// for the user to examine in case of errors.
692 /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode
693 /// (as opposed to Punycode). The first argument is the label for which a decision is
694 /// needed (always non-empty slice). The second argument is the TLD (potentially empty).
695 /// The third argument is `true` iff the domain name as a whole is a bidi domain name.
696 /// Only non-erroneous labels that contain at least one non-ASCII character are passed
697 /// to the closure as the first argument. The second and third argument values are
698 /// guaranteed to remain the same during a single call to `process`, and the closure
699 /// may cache computations derived from the second and third argument (hence the
700 /// `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be
701 /// passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must
702 /// be passed as the closure. A more complex closure may be used to prepare a domain
703 /// name for display in a user interface so that labels are converted to the Unicode
704 /// form in general but potentially misleading labels are converted to the Punycode
705 /// form.
706 /// * `sink` - The object that receives the output (in the non-passthrough case).
707 /// * `ascii_sink` - A second sink that receives the _ToASCII_ form only if there
708 /// were no errors and `sink` received at least one character of non-ASCII output.
709 /// The purpose of this argument is to enable a user interface display form of the
710 /// domain and the _ToASCII_ form of the domain to be computed efficiently together.
711 /// This argument is useless when `output_as_unicode` always returns `false`, in
712 /// which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives
713 /// no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the
714 /// output received by `sink` also as the _ToASCII_ result.
715 ///
716 /// # Return value
717 ///
718 /// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat
719 /// `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This
720 /// return value asserts that calling `core::str::from_utf8_unchecked(domain_name)`
721 /// is safe.)
722 /// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat was was written
723 /// to `sink` as the output. If another sink was passed as `ascii_sink` but it did
724 /// not receive output, the caller must treat what was written to `sink` also as
725 /// the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller
726 /// must treat what was written to `ascii_sink` as the _ToASCII_ output.
727 /// * `Err(ProcessingError::ValidityError)` - The input was in error and must
728 /// not be used for DNS lookup or otherwise in a network protocol. If `error_policy`
729 /// was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed
730 /// to the user as an illustration of where the error was or the errors were.
731 /// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned
732 /// [`core::fmt::Error`]. The partial output written to `sink` `ascii_sink` must not
733 /// be used. If `W` never returns [`core::fmt::Error`], this method never returns
734 /// `Err(ProcessingError::SinkError)`.
735 ///
736 /// # Safety-usable invariant
737 ///
738 /// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is
739 /// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note:
740 /// Other return values do _not_ imply that `domain_name` wasn't ASCII!)
741 ///
742 /// # Security considerations
743 ///
744 /// Showing labels whose Unicode form might mislead the user as Punycode instead is
745 /// an imperfect security mechanism, because [the Punycode form itself may be resemble
746 /// a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing).
747 /// However, since this mechanism is common practice, this API provides support for the
748 /// the mechanism.
749 ///
750 /// Punycode processing is quadratic, so to avoid denial of service, this method imposes
751 /// length limits on Punycode treating especially long inputs as being in error. These
752 /// limits are well higher than the DNS length limits and are not more restrictive than
753 /// the limits imposed by ICU4C.
    #[allow(clippy::too_many_arguments)]
    pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>(
        &self,
        domain_name: &[u8],
        ascii_deny_list: AsciiDenyList,
        hyphens: Hyphens,
        error_policy: ErrorPolicy,
        mut output_as_unicode: OutputUnicode,
        sink: &mut W,
        ascii_sink: Option<&mut W>,
    ) -> Result<ProcessingSuccess, ProcessingError> {
        let fail_fast = error_policy == ErrorPolicy::FailFast;
        // Filled by `process_inner` with the processed form of the part of the
        // input that could not be passed through, as dot-separated labels.
        let mut domain_buffer = SmallVec::<[char; 253]>::new();
        // One entry per label in `domain_buffer`, recording whether the original
        // input bytes of that label are available as ASCII / Punycode.
        let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; 8]>::new();
        // `process_inner` could be pasted inline here, but it's out of line in order
        // to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`.
        let (passthrough_up_to, is_bidi, had_errors) = self.process_inner(
            domain_name,
            ascii_deny_list,
            hyphens,
            fail_fast,
            &mut domain_buffer,
            &mut already_punycode,
        );
        // The whole input can be used as-is: nothing is written to either sink.
        if passthrough_up_to == domain_name.len() {
            debug_assert!(!had_errors);
            return Ok(ProcessingSuccess::Passthrough);
        }
        // Checked only after passthrough as a micro optimization.
        if fail_fast && had_errors {
            return Err(ProcessingError::ValidityError);
        }
        // Outside fail-fast mode, errors are recorded in-band as U+FFFD.
        debug_assert_eq!(had_errors, domain_buffer.contains(&'\u{FFFD}'));
        // Ignore a single trailing dot when locating the last label.
        let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.']) {
            without_dot
        } else {
            &domain_buffer[..]
        };
        // unwrap is OK, because we always have at least one label
        let tld = without_dot.rsplit(|c| *c == '.').next().unwrap();
        // Set when the callback chose Unicode output for at least one label.
        let mut had_unicode_output = false;
        let mut seen_label = false;
        let mut already_punycode_iter = already_punycode.iter();
        // Length of the prefix of `domain_name` that is still usable verbatim.
        // It keeps growing in the loop below until something has to be written
        // to `sink` (`flushed_prefix`); at that point the prefix is written first.
        let mut passthrough_up_to_extended = passthrough_up_to;
        let mut flushed_prefix = false;
        for label in domain_buffer.split(|c| *c == '.') {
            // Unwrap is OK, because there are supposed to be as many items in
            // `already_punycode` as there are labels.
            let input_punycode = *already_punycode_iter.next().unwrap();
            if seen_label {
                // Emit the separator between labels: either write it to the sink or,
                // if nothing has been written yet, extend the verbatim prefix over it.
                if flushed_prefix {
                    sink.write_char('.')?;
                } else {
                    debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
                    passthrough_up_to_extended += 1;
                    if passthrough_up_to_extended == domain_name.len() {
                        debug_assert!(!had_errors);
                        return Ok(ProcessingSuccess::Passthrough);
                    }
                }
            }
            seen_label = true;

            // ASCII, non-Punycode label whose original input bytes are available:
            // the output is the input with upper-case ASCII letters lower-cased.
            if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
                if let Some(first_upper_case) =
                    mixed_case.iter().position(|c| c.is_ascii_uppercase())
                {
                    // Everything before the first upper-case letter is written verbatim
                    // (together with the pending prefix, if not flushed yet); the rest
                    // is lower-cased byte by byte.
                    let (head, tail) = mixed_case.split_at(first_upper_case);
                    let slice_to_write = if flushed_prefix {
                        head
                    } else {
                        flushed_prefix = true;
                        passthrough_up_to_extended += head.len();
                        debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
                        &domain_name[..passthrough_up_to_extended]
                    };
                    // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
                    for c in tail.iter() {
                        sink.write_char(char::from(c.to_ascii_lowercase()))?;
                    }
                } else if flushed_prefix {
                    // SAFETY: `mixed_case` is known to be ASCII.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
                } else {
                    // No upper case and nothing written so far: the label extends
                    // the verbatim prefix.
                    passthrough_up_to_extended += mixed_case.len();
                    if passthrough_up_to_extended == domain_name.len() {
                        debug_assert!(!had_errors);
                        return Ok(ProcessingSuccess::Passthrough);
                    }
                }
                continue;
            }

            // Decide whether the label is a candidate for Punycode output. In fail-fast
            // mode there are no errors at this point, so a plain ASCII test suffices.
            let potentially_punycode = if fail_fast {
                debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error);
                !is_ascii(label)
            } else {
                classify_for_punycode(label) == PunycodeClassification::Unicode
            };
            // For Punycode candidates, the caller-supplied callback decides whether
            // the label is written as Unicode (`true`) or as Punycode (`false`).
            // Other labels are always written as they are in `domain_buffer`.
            let passthrough = if potentially_punycode {
                let unicode = output_as_unicode(label, tld, is_bidi);
                had_unicode_output |= unicode;
                unicode
            } else {
                true
            };
            if passthrough {
                if !flushed_prefix {
                    flushed_prefix = true;
                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
                    sink.write_str(unsafe {
                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
                    })?;
                }
                for c in label.iter().copied() {
                    sink.write_char(c)?;
                }
            } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode {
                // The input label already was Punycode: reuse the input bytes
                // (lower-cased from the first upper-case letter on) instead of re-encoding.
                if let Some(first_upper_case) =
                    mixed_case.iter().position(|c| c.is_ascii_uppercase())
                {
                    let (head, tail) = mixed_case.split_at(first_upper_case);
                    let slice_to_write = if flushed_prefix {
                        head
                    } else {
                        flushed_prefix = true;
                        passthrough_up_to_extended += head.len();
                        debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
                        &domain_name[..passthrough_up_to_extended]
                    };
                    // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?;
                    for c in tail.iter() {
                        sink.write_char(char::from(c.to_ascii_lowercase()))?;
                    }
                } else if flushed_prefix {
                    // SAFETY: `mixed_case` is known to be ASCII.
                    sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
                } else {
                    passthrough_up_to_extended += mixed_case.len();
                    if passthrough_up_to_extended == domain_name.len() {
                        debug_assert!(!had_errors);
                        return Ok(ProcessingSuccess::Passthrough);
                    }
                }
            } else {
                // No reusable input bytes: encode the label as Punycode.
                if !flushed_prefix {
                    flushed_prefix = true;
                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
                    sink.write_str(unsafe {
                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
                    })?;
                }
                write_punycode_label(label, sink)?;
            }
        }

        // Outside fail-fast mode the output (with U+FFFD markers) has been written
        // to `sink` before the error is reported.
        if had_errors {
            return Err(ProcessingError::ValidityError);
        }

        // The second sink is only written when `sink` received at least one
        // Unicode label. It gets the same domain with every non-ASCII label in
        // Punycode form; the callback is not consulted again.
        if had_unicode_output {
            if let Some(sink) = ascii_sink {
                let mut seen_label = false;
                let mut already_punycode_iter = already_punycode.iter();
                let mut passthrough_up_to_extended = passthrough_up_to;
                let mut flushed_prefix = false;
                for label in domain_buffer.split(|c| *c == '.') {
                    // Unwrap is OK, because there are supposed to be as many items in
                    // `already_punycode` as there are labels.
                    let input_punycode = *already_punycode_iter.next().unwrap();
                    if seen_label {
                        if flushed_prefix {
                            sink.write_char('.')?;
                        } else {
                            debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.');
                            passthrough_up_to_extended += 1;
                        }
                    }
                    seen_label = true;

                    // Same lower-casing of ASCII labels as in the first loop, but
                    // without the early `Passthrough` returns.
                    if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode {
                        if let Some(first_upper_case) =
                            mixed_case.iter().position(|c| c.is_ascii_uppercase())
                        {
                            let (head, tail) = mixed_case.split_at(first_upper_case);
                            let slice_to_write = if flushed_prefix {
                                head
                            } else {
                                flushed_prefix = true;
                                passthrough_up_to_extended += head.len();
                                debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
                                &domain_name[..passthrough_up_to_extended]
                            };
                            // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
                            sink.write_str(unsafe {
                                core::str::from_utf8_unchecked(slice_to_write)
                            })?;
                            for c in tail.iter() {
                                sink.write_char(char::from(c.to_ascii_lowercase()))?;
                            }
                        } else if flushed_prefix {
                            // SAFETY: `mixed_case` is known to be ASCII.
                            sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
                        } else {
                            passthrough_up_to_extended += mixed_case.len();
                        }
                        continue;
                    }

                    if is_ascii(label) {
                        // Already ASCII after processing: write it as-is.
                        if !flushed_prefix {
                            flushed_prefix = true;
                            // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
                            sink.write_str(unsafe {
                                core::str::from_utf8_unchecked(
                                    &domain_name[..passthrough_up_to_extended],
                                )
                            })?;
                        }
                        for c in label.iter().copied() {
                            sink.write_char(c)?;
                        }
                    } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode
                    {
                        // Non-ASCII label that was Punycode in the input: reuse the
                        // (lower-cased) input bytes.
                        if let Some(first_upper_case) =
                            mixed_case.iter().position(|c| c.is_ascii_uppercase())
                        {
                            let (head, tail) = mixed_case.split_at(first_upper_case);
                            let slice_to_write = if flushed_prefix {
                                head
                            } else {
                                flushed_prefix = true;
                                passthrough_up_to_extended += head.len();
                                debug_assert_ne!(passthrough_up_to_extended, domain_name.len());
                                &domain_name[..passthrough_up_to_extended]
                            };
                            // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII.
                            sink.write_str(unsafe {
                                core::str::from_utf8_unchecked(slice_to_write)
                            })?;
                            for c in tail.iter() {
                                sink.write_char(char::from(c.to_ascii_lowercase()))?;
                            }
                        } else if flushed_prefix {
                            // SAFETY: `mixed_case` is known to be ASCII.
                            sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?;
                        } else {
                            passthrough_up_to_extended += mixed_case.len();
                        }
                    } else {
                        // Non-ASCII label without reusable input bytes: encode it.
                        if !flushed_prefix {
                            flushed_prefix = true;
                            // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
                            sink.write_str(unsafe {
                                core::str::from_utf8_unchecked(
                                    &domain_name[..passthrough_up_to_extended],
                                )
                            })?;
                        }
                        write_punycode_label(label, sink)?;
                    }
                }
                // If no label forced a write, the ASCII form is the verbatim prefix
                // accumulated so far; write it now so that the sink is not left empty.
                if !flushed_prefix {
                    // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII.
                    sink.write_str(unsafe {
                        core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended])
                    })?;
                }
            }
        }
        Ok(ProcessingSuccess::WroteToSink)
    }
1028
1029 /// The part of `process` that doesn't need to be generic over the sink.
1030 #[inline(always)]
1031 fn process_inner<'a>(
1032 &self,
1033 domain_name: &'a [u8],
1034 ascii_deny_list: AsciiDenyList,
1035 hyphens: Hyphens,
1036 fail_fast: bool,
1037 domain_buffer: &mut SmallVec<[char; 253]>,
1038 already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>,
1039 ) -> (usize, bool, bool) {
1040 // Sadly, this even faster-path ASCII tier is needed to avoid regressing
1041 // performance.
1042 let mut iter = domain_name.iter();
1043 let mut most_recent_label_start = iter.clone();
1044 loop {
1045 if let Some(&b) = iter.next() {
1046 if in_inclusive_range8(b, b'a', b'z') {
1047 continue;
1048 }
1049 if b == b'.' {
1050 most_recent_label_start = iter.clone();
1051 continue;
1052 }
1053 return self.process_innermost(
1054 domain_name,
1055 ascii_deny_list,
1056 hyphens,
1057 fail_fast,
1058 domain_buffer,
1059 already_punycode,
1060 most_recent_label_start.as_slice(),
1061 );
1062 } else {
1063 // Success! The whole input passes through on the fastest path!
1064 return (domain_name.len(), false, false);
1065 }
1066 }
1067 }
1068
    /// The part of `process` that doesn't need to be generic over the sink and
    /// can avoid monomorphizing in the interest of code size.
    /// Separating this into a different stack frame compared to `process_inner`
    /// improves performance in the ICU4X case.
    ///
    /// `tail` is the suffix of `domain_name` that still needs to be examined;
    /// everything in `domain_name` before `tail` is treated as passing through.
    /// Labels that cannot be passed through are appended to `domain_buffer`
    /// in processed form (separated by `'.'`), and one `AlreadyAsciiLabel`
    /// per such label is pushed to `already_punycode`.
    ///
    /// Returns `(passthrough_up_to, is_bidi, had_errors)`, where
    /// `passthrough_up_to` is the length of the prefix of `domain_name` that
    /// can be used verbatim. When `fail_fast` is `true`, the first error makes
    /// this method return `(0, false, true)` immediately. Otherwise, each
    /// error is marked in `domain_buffer` with U+FFFD and processing continues.
    #[allow(clippy::too_many_arguments)]
    #[inline(never)]
    fn process_innermost<'a>(
        &self,
        domain_name: &'a [u8],
        ascii_deny_list: AsciiDenyList,
        hyphens: Hyphens,
        fail_fast: bool,
        domain_buffer: &mut SmallVec<[char; 253]>,
        already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>,
        tail: &'a [u8],
    ) -> (usize, bool, bool) {
        let deny_list = ascii_deny_list.bits;
        // Variant of the deny list that additionally has the dot bit set;
        // passed on for the output of Punycode decoding.
        let deny_list_deny_dot = deny_list | DOT_MASK;

        let mut had_errors = false;

        let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name`
                                                                    // 253 ASCII characters is the max length for a valid domain name
                                                                    // (excluding the root dot).
        let mut current_label_start; // Index into `domain_buffer`
        let mut seen_label = false;
        // `true` until the first label that cannot be passed through.
        let mut in_prefix = true;
        for label in tail.split(|b| *b == b'.') {
            // We check for passthrough only for the prefix. That is, if we
            // haven't moved on and started filling `domain_buffer`. Keeping
            // this stuff in one loop where the first items keep being skipped
            // once they have been skipped at least once instead of working
            // this into a fancier loop structure in order to make sure that
            // no item from the iterator is lost or processed twice.
            // Furthermore, after the passthrough fails, restarting the
            // normalization process after each pre-existing ASCII dot also
            // provides an opportunity for the processing to get back onto
            // an ASCII fast path that bypasses the normalizer for ASCII
            // after a pre-existing ASCII dot (pre-existing in the sense
            // of not coming from e.g. normalizing an ideographic dot).
            if in_prefix && is_passthrough_ascii_label(label) {
                if seen_label {
                    debug_assert_eq!(domain_name[passthrough_up_to], b'.');
                    passthrough_up_to += 1;
                }
                seen_label = true;

                passthrough_up_to += label.len();
                continue;
            }
            // Account for the dot before this label: while still in the prefix it
            // belongs to the passthrough part, afterwards it goes to the buffer.
            if seen_label {
                if in_prefix {
                    debug_assert_eq!(domain_name[passthrough_up_to], b'.');
                    passthrough_up_to += 1;
                } else {
                    domain_buffer.push('.');
                }
            }
            seen_label = true;
            in_prefix = false;
            current_label_start = domain_buffer.len();
            if !label.is_empty() {
                let (ascii, non_ascii) = split_ascii_fast_path_prefix(label);
                // `true` if the whole label is ASCII and does not have the Punycode
                // prefix. All-ASCII labels with the prefix are fully handled (and
                // `continue`d) inside this expression unless a precondition fails.
                let non_punycode_ascii_label = if non_ascii.is_empty() {
                    if has_punycode_prefix(ascii) {
                        // Preconditions for decoding: the label must not end with a
                        // hyphen and the part after the 4-byte prefix must not be
                        // excessively long.
                        if (ascii.last() != Some(&b'-'))
                            && (ascii.len() - 4 <= PUNYCODE_DECODE_MAX_INPUT_LENGTH)
                        {
                            if let Ok(decode) =
                                Decoder::default().decode::<u8, InternalCaller>(&ascii[4..])
                            {
                                // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
                                // characters.
                                let mut label_buffer = SmallVec::<[char; 59]>::new();
                                label_buffer.extend(decode);

                                // Appends the validated decoded label to `domain_buffer`.
                                if self.after_punycode_decode(
                                    domain_buffer,
                                    current_label_start,
                                    &label_buffer,
                                    deny_list_deny_dot,
                                    fail_fast,
                                    &mut had_errors,
                                ) {
                                    return (0, false, true);
                                }

                                if self.check_label(
                                    hyphens,
                                    &mut domain_buffer[current_label_start..],
                                    fail_fast,
                                    &mut had_errors,
                                    true,
                                    true,
                                ) {
                                    return (0, false, true);
                                }
                            } else {
                                // Punycode failed
                                if fail_fast {
                                    return (0, false, true);
                                }
                                had_errors = true;
                                // Mark the error at the first character and keep the rest
                                // of the label after applying the deny list.
                                domain_buffer.push('\u{FFFD}');
                                let mut iter = ascii.iter();
                                // Discard the first character that we replaced.
                                let _ = iter.next();
                                domain_buffer.extend(iter.map(|c| {
                                    // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
                                    // not matter.
                                    apply_ascii_deny_list_to_potentially_upper_case_ascii(
                                        *c, deny_list,
                                    )
                                }));
                            };
                            // If there were errors, we won't be trying to use this
                            // anyway later, so it's fine to put it here unconditionally.
                            already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label));
                            continue;
                        } else if fail_fast {
                            return (0, false, true);
                        }
                        // Else fall through to the complex path and rediscover error
                        // there.
                        false
                    } else {
                        true
                    }
                } else {
                    false
                };
                // Copy the ASCII prefix of the label into the buffer, applying the deny
                // list; denied characters come out as U+FFFD.
                for c in ascii.iter().map(|c| {
                    // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does
                    // not matter.
                    apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list)
                }) {
                    if c == '\u{FFFD}' {
                        if fail_fast {
                            return (0, false, true);
                        }
                        had_errors = true;
                    }
                    domain_buffer.push(c);
                }
                if non_punycode_ascii_label {
                    // Plain ASCII label: only the hyphen checks apply.
                    if hyphens != Hyphens::Allow
                        && check_hyphens(
                            &mut domain_buffer[current_label_start..],
                            hyphens == Hyphens::CheckFirstLast,
                            fail_fast,
                            &mut had_errors,
                        )
                    {
                        return (0, false, true);
                    }
                    // The original bytes are only offered for reuse if no error has
                    // been seen so far.
                    already_punycode.push(if had_errors {
                        AlreadyAsciiLabel::Other
                    } else {
                        AlreadyAsciiLabel::MixedCaseAscii(label)
                    });
                    continue;
                }
                // Complex path: the label has a non-ASCII part (or is a Punycode label
                // that failed the preconditions above).
                already_punycode.push(AlreadyAsciiLabel::Other);
                let mut first_needs_combining_mark_check = ascii.is_empty();
                let mut needs_contextj_check = !non_ascii.is_empty();
                // Map and normalize the non-ASCII remainder, then apply the deny list.
                let mut mapping = self
                    .data
                    .map_normalize(non_ascii.chars())
                    .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list));
                loop {
                    let n = mapping.next();
                    match n {
                        // End of the input label, or a dot produced by the mapping:
                        // either way the label accumulated in the buffer is complete.
                        None | Some('.') => {
                            if domain_buffer[current_label_start..]
                                .starts_with(&['x', 'n', '-', '-'])
                            {
                                // The label is Punycode after mapping. Check the
                                // preconditions for decoding it; each failure is marked
                                // with U+FFFD.
                                let mut punycode_precondition_failed = false;
                                for c in domain_buffer[current_label_start + 4..].iter_mut() {
                                    if !c.is_ascii() {
                                        if fail_fast {
                                            return (0, false, true);
                                        }
                                        had_errors = true;
                                        *c = '\u{FFFD}';
                                        punycode_precondition_failed = true;
                                    }
                                }

                                if let Some(last) = domain_buffer.last_mut() {
                                    if *last == '-' {
                                        // Either there's nothing after the "xn--" prefix
                                        // and we got the last hyphen of "xn--", or there
                                        // are no Punycode digits after the last delimiter
                                        // which would result in Punycode decode outputting
                                        // ASCII only.
                                        if fail_fast {
                                            return (0, false, true);
                                        }
                                        had_errors = true;
                                        *last = '\u{FFFD}';
                                        punycode_precondition_failed = true;
                                    }
                                } else {
                                    // The buffer holds at least the "xn--" prefix here.
                                    unreachable!();
                                }

                                // Reject excessively long input
                                // https://github.com/whatwg/url/issues/824
                                // https://unicode-org.atlassian.net/browse/ICU-13727
                                if domain_buffer.len() - current_label_start - 4
                                    > PUNYCODE_DECODE_MAX_INPUT_LENGTH
                                {
                                    if fail_fast {
                                        return (0, false, true);
                                    }
                                    had_errors = true;
                                    // Mark the first character past the limit.
                                    domain_buffer[current_label_start
                                        + 4
                                        + PUNYCODE_DECODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
                                    punycode_precondition_failed = true;
                                }

                                if !punycode_precondition_failed {
                                    if let Ok(decode) = Decoder::default()
                                        .decode::<char, InternalCaller>(
                                            &domain_buffer[current_label_start + 4..],
                                        )
                                    {
                                        // The decoded label needs the full set of checks.
                                        first_needs_combining_mark_check = true;
                                        needs_contextj_check = true;
                                        // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4
                                        // characters.
                                        let mut label_buffer = SmallVec::<[char; 59]>::new();
                                        label_buffer.extend(decode);

                                        // Replace the Punycode form in the buffer with the
                                        // validated decoded form.
                                        domain_buffer.truncate(current_label_start);
                                        if self.after_punycode_decode(
                                            domain_buffer,
                                            current_label_start,
                                            &label_buffer,
                                            deny_list_deny_dot,
                                            fail_fast,
                                            &mut had_errors,
                                        ) {
                                            return (0, false, true);
                                        }
                                    } else {
                                        // Punycode failed
                                        if fail_fast {
                                            return (0, false, true);
                                        }
                                        had_errors = true;
                                        domain_buffer[current_label_start] = '\u{FFFD}';
                                        needs_contextj_check = false; // ASCII label
                                        first_needs_combining_mark_check = false;
                                    };
                                } else {
                                    first_needs_combining_mark_check = false;
                                    needs_contextj_check = false; // Non-ASCII already turned to U+FFFD.
                                }
                            }
                            if self.check_label(
                                hyphens,
                                &mut domain_buffer[current_label_start..],
                                fail_fast,
                                &mut had_errors,
                                first_needs_combining_mark_check,
                                needs_contextj_check,
                            ) {
                                return (0, false, true);
                            }

                            if n.is_none() {
                                break;
                            }
                            // The mapping produced a dot: start a new label in the
                            // buffer, with its own `already_punycode` entry.
                            domain_buffer.push('.');
                            current_label_start = domain_buffer.len();
                            first_needs_combining_mark_check = true;
                            needs_contextj_check = true;
                            already_punycode.push(AlreadyAsciiLabel::Other);
                        }
                        Some(c) => {
                            if c == '\u{FFFD}' {
                                if fail_fast {
                                    return (0, false, true);
                                }
                                had_errors = true;
                            }
                            domain_buffer.push(c);
                        }
                    }
                }
            } else {
                // Empty label
                already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label));
            }
        }

        // The bidi checks below are applied to every label, but only if some
        // character in the buffer makes the domain a bidi domain.
        let is_bidi = self.is_bidi(domain_buffer);
        if is_bidi {
            for label in domain_buffer.split_mut(|c| *c == '.') {
                if let Some((first, tail)) = label.split_first_mut() {
                    let first_bc = self.data.bidi_class(*first);
                    if !FIRST_BC_MASK.intersects(first_bc.to_mask()) {
                        // Neither RTL label nor LTR label
                        if fail_fast {
                            return (0, false, true);
                        }
                        had_errors = true;
                        *first = '\u{FFFD}';
                        continue;
                    }
                    // The class of the first character decides which set of
                    // masks the rest of the label is checked against.
                    let is_ltr = first_bc.is_ltr();
                    // Trim NSM
                    let mut middle = tail;
                    #[allow(clippy::while_let_loop)]
                    loop {
                        if let Some((last, prior)) = middle.split_last_mut() {
                            let last_bc = self.data.bidi_class(*last);
                            if last_bc.is_nonspacing_mark() {
                                middle = prior;
                                continue;
                            }
                            // `last` is the last character that is not a trailing
                            // non-spacing mark; `prior` is everything between the
                            // first character and `last`.
                            let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK };
                            if !last_mask.intersects(last_bc.to_mask()) {
                                if fail_fast {
                                    return (0, false, true);
                                }
                                had_errors = true;
                                *last = '\u{FFFD}';
                            }
                            if is_ltr {
                                for c in prior.iter_mut() {
                                    let bc = self.data.bidi_class(*c);
                                    if !MIDDLE_LTR_MASK.intersects(bc.to_mask()) {
                                        if fail_fast {
                                            return (0, false, true);
                                        }
                                        had_errors = true;
                                        *c = '\u{FFFD}';
                                    }
                                }
                            } else {
                                // In an RTL label, European and Arabic numbers must
                                // not be mixed: the first kind seen decides, and any
                                // later number of the other kind is an error.
                                let mut numeral_state = RtlNumeralState::Undecided;
                                for c in prior.iter_mut() {
                                    let bc = self.data.bidi_class(*c);
                                    if !MIDDLE_RTL_MASK.intersects(bc.to_mask()) {
                                        if fail_fast {
                                            return (0, false, true);
                                        }
                                        had_errors = true;
                                        *c = '\u{FFFD}';
                                    } else {
                                        match numeral_state {
                                            RtlNumeralState::Undecided => {
                                                if bc.is_european_number() {
                                                    numeral_state = RtlNumeralState::European;
                                                } else if bc.is_arabic_number() {
                                                    numeral_state = RtlNumeralState::Arabic;
                                                }
                                            }
                                            RtlNumeralState::European => {
                                                if bc.is_arabic_number() {
                                                    if fail_fast {
                                                        return (0, false, true);
                                                    }
                                                    had_errors = true;
                                                    *c = '\u{FFFD}';
                                                }
                                            }
                                            RtlNumeralState::Arabic => {
                                                if bc.is_european_number() {
                                                    if fail_fast {
                                                        return (0, false, true);
                                                    }
                                                    had_errors = true;
                                                    *c = '\u{FFFD}';
                                                }
                                            }
                                        }
                                    }
                                }
                                // The last character takes part in the numeral
                                // mixing check as well.
                                if (numeral_state == RtlNumeralState::European
                                    && last_bc.is_arabic_number())
                                    || (numeral_state == RtlNumeralState::Arabic
                                        && last_bc.is_european_number())
                                {
                                    if fail_fast {
                                        return (0, false, true);
                                    }
                                    had_errors = true;
                                    *last = '\u{FFFD}';
                                }
                            }
                            break;
                        } else {
                            // One-character label or label where
                            // everything after the first character
                            // is just non-spacing marks.
                            break;
                        }
                    }
                }
            }
        }

        (passthrough_up_to, is_bidi, had_errors)
    }
1477
1478 #[inline(never)]
1479 fn after_punycode_decode(
1480 &self,
1481 domain_buffer: &mut SmallVec<[char; 253]>,
1482 current_label_start: usize,
1483 label_buffer: &[char],
1484 deny_list_deny_dot: u128,
1485 fail_fast: bool,
1486 had_errors: &mut bool,
1487 ) -> bool {
1488 for c in self
1489 .data
1490 .normalize_validate(label_buffer.iter().copied())
1491 .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot))
1492 {
1493 if c == '\u{FFFD}' {
1494 if fail_fast {
1495 return true;
1496 }
1497 *had_errors = true;
1498 }
1499 domain_buffer.push(c);
1500 }
1501 let normalized = &mut domain_buffer[current_label_start..];
1502 if let Err(()) =
1503 normalized
1504 .iter_mut()
1505 .zip(label_buffer.iter())
1506 .try_for_each(|(norm_c, decoded_c)| {
1507 if *norm_c == *decoded_c {
1508 Ok(())
1509 } else {
1510 // Mark the first difference
1511 *norm_c = '\u{FFFD}';
1512 Err(())
1513 }
1514 })
1515 {
1516 if fail_fast {
1517 return true;
1518 }
1519 *had_errors = true;
1520 }
1521 false
1522 }
1523
1524 #[inline(never)]
1525 fn check_label(
1526 &self,
1527 hyphens: Hyphens,
1528 mut_label: &mut [char],
1529 fail_fast: bool,
1530 had_errors: &mut bool,
1531 first_needs_combining_mark_check: bool,
1532 needs_contextj_check: bool,
1533 ) -> bool {
1534 if hyphens != Hyphens::Allow
1535 && check_hyphens(
1536 mut_label,
1537 hyphens == Hyphens::CheckFirstLast,
1538 fail_fast,
1539 had_errors,
1540 )
1541 {
1542 return true;
1543 }
1544 if first_needs_combining_mark_check {
1545 if let Some(first) = mut_label.first_mut() {
1546 if self.data.is_mark(*first) {
1547 if fail_fast {
1548 return true;
1549 }
1550 *had_errors = true;
1551 *first = '\u{FFFD}';
1552 }
1553 }
1554 }
1555 if needs_contextj_check {
1556 // ContextJ
1557 for i in 0..mut_label.len() {
1558 let c = mut_label[i];
1559 if !in_inclusive_range_char(c, '\u{200C}', '\u{200D}') {
1560 continue;
1561 }
1562 let (head, joiner_and_tail) = mut_label.split_at_mut(i);
1563
1564 if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() {
1565 if let Some(previous) = head.last() {
1566 if self.data.is_virama(*previous) {
1567 continue;
1568 }
1569 } else {
1570 // No preceding character
1571 if fail_fast {
1572 return true;
1573 }
1574 *had_errors = true;
1575 *joiner = '\u{FFFD}';
1576 continue;
1577 }
1578 if c == '\u{200D}' {
1579 // ZWJ only has the virama rule
1580 if fail_fast {
1581 return true;
1582 }
1583 *had_errors = true;
1584 *joiner = '\u{FFFD}';
1585 continue;
1586 }
1587 debug_assert_eq!(c, '\u{200C}');
1588 if !self.has_appropriately_joining_char(
1589 head.iter().rev().copied(),
1590 LEFT_OR_DUAL_JOINING_MASK,
1591 ) || !self.has_appropriately_joining_char(
1592 tail.iter().copied(),
1593 RIGHT_OR_DUAL_JOINING_MASK,
1594 ) {
1595 if fail_fast {
1596 return true;
1597 }
1598 *had_errors = true;
1599 *joiner = '\u{FFFD}';
1600 }
1601 } else {
1602 debug_assert!(false);
1603 }
1604 }
1605 }
1606
1607 if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH {
1608 // Limit quadratic behavior
1609 // https://github.com/whatwg/url/issues/824
1610 // https://unicode-org.atlassian.net/browse/ICU-13727
1611 if fail_fast {
1612 return true;
1613 }
1614 *had_errors = true;
1615 mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = '\u{FFFD}';
1616 }
1617 false
1618 }
1619
1620 #[inline(always)]
1621 fn has_appropriately_joining_char<I: Iterator<Item = char>>(
1622 &self,
1623 iter: I,
1624 required_mask: JoiningTypeMask,
1625 ) -> bool {
1626 for c in iter {
1627 let jt = self.data.joining_type(c);
1628 if jt.to_mask().intersects(required_mask) {
1629 return true;
1630 }
1631 if jt.is_transparent() {
1632 continue;
1633 }
1634 return false;
1635 }
1636 false
1637 }
1638
1639 #[inline(always)]
1640 fn is_bidi(&self, buffer: &[char]) -> bool {
1641 for &c in buffer {
1642 if c < '\u{0590}' {
1643 // Below Hebrew
1644 continue;
1645 }
1646 if in_inclusive_range_char(c, '\u{0900}', '\u{FB1C}') {
1647 debug_assert_ne!(c, '\u{200F}'); // disallowed
1648 continue;
1649 }
1650 if in_inclusive_range_char(c, '\u{1F000}', '\u{3FFFF}') {
1651 continue;
1652 }
1653 if in_inclusive_range_char(c, '\u{FF00}', '\u{107FF}') {
1654 continue;
1655 }
1656 if in_inclusive_range_char(c, '\u{11000}', '\u{1E7FF}') {
1657 continue;
1658 }
1659 if RTL_MASK.intersects(self.data.bidi_class(c).to_mask()) {
1660 return true;
1661 }
1662 }
1663 false
1664 }
1665}
1666
/// Checks the hyphen placement in `mut_label`, overwriting each offending
/// hyphen with U+FFFD in place.
///
/// A hyphen is an error as the first or as the last character of the label.
/// Unless `allow_third_fourth` is `true`, hyphens in both the third and the
/// fourth position are an error as well. The first/last check runs first, so
/// a trailing hyphen that has already been replaced no longer counts for the
/// third/fourth check.
///
/// Returns `true` if an error was found and `fail_fast` is set; in that case
/// neither the label nor `*had_errors` has been modified. Otherwise returns
/// `false`, with `*had_errors` set to `true` if any error was found.
fn check_hyphens(
    mut_label: &mut [char],
    allow_third_fourth: bool,
    fail_fast: bool,
    had_errors: &mut bool,
) -> bool {
    let len = mut_label.len();
    if len > 0 {
        // First character, then last character. For a one-character label both
        // indices coincide; the second pass then sees the replacement, if any.
        for edge in [0, len - 1] {
            if mut_label[edge] == '-' {
                if fail_fast {
                    return true;
                }
                *had_errors = true;
                mut_label[edge] = '\u{FFFD}';
            }
        }
    }

    if !allow_third_fourth && matches!(mut_label.get(2..4), Some(['-', '-'])) {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        mut_label[2..4].fill('\u{FFFD}');
    }
    false
}