icu_locale_core/langid.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6#[cfg(feature = "alloc")]
7use core::str::FromStr;
8
9use crate::parser;
10use crate::subtags;
11use crate::ParseError;
12#[cfg(feature = "alloc")]
13use alloc::borrow::Cow;
14
15/// A core struct representing a [`Unicode BCP47 Language Identifier`].
16///
17/// # Ordering
18///
19/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
20/// multiple possible orderings. Depending on your use case, two orderings are available:
21///
22/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
23/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
24///
25/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
26///
27/// # Parsing
28///
29/// Unicode recognizes three levels of standard conformance for any language identifier:
30///
31/// * *well-formed* - syntactically correct
32/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
33/// * *canonical* - valid and no deprecated codes or structure.
34///
35/// At the moment parsing normalizes a well-formed language identifier converting
36/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
37///
38/// Any syntactically invalid subtags will cause the parsing to fail with an error.
39///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements are performed.
41/// For validation and canonicalization, see `LocaleCanonicalizer`.
42///
43/// # Serde
44///
45/// This type implements `serde::Serialize` and `serde::Deserialize` if the
46/// `"serde"` Cargo feature is enabled on the crate.
47///
48/// The value will be serialized as a string and parsed when deserialized.
49/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
50///
51/// # Examples
52///
53/// Simple example:
54///
55/// ```
56/// use icu::locale::{
57/// langid,
58/// subtags::{language, region},
59/// };
60///
61/// let li = langid!("en-US");
62///
63/// assert_eq!(li.language, language!("en"));
64/// assert_eq!(li.script, None);
65/// assert_eq!(li.region, Some(region!("US")));
66/// assert_eq!(li.variants.len(), 0);
67/// ```
68///
69/// More complex example:
70///
71/// ```
72/// use icu::locale::{
73/// langid,
74/// subtags::{language, region, script, variant},
75/// };
76///
77/// let li = langid!("eN-latn-Us-Valencia");
78///
79/// assert_eq!(li.language, language!("en"));
80/// assert_eq!(li.script, Some(script!("Latn")));
81/// assert_eq!(li.region, Some(region!("US")));
82/// assert_eq!(li.variants.first(), Some(&variant!("valencia")));
83/// ```
84///
85/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
86#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
87#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
88pub struct LanguageIdentifier {
89 /// Language subtag of the language identifier.
90 pub language: subtags::Language,
91 /// Script subtag of the language identifier.
92 pub script: Option<subtags::Script>,
93 /// Region subtag of the language identifier.
94 pub region: Option<subtags::Region>,
95 /// Variant subtags of the language identifier.
96 pub variants: subtags::Variants,
97}
98
99impl LanguageIdentifier {
100 /// The unknown language identifier "und".
101 pub const UNKNOWN: Self = crate::langid!("und");
102
103 /// A constructor which takes a utf8 slice, parses it and
104 /// produces a well-formed [`LanguageIdentifier`].
105 ///
106 /// ✨ *Enabled with the `alloc` Cargo feature.*
107 ///
108 /// # Examples
109 ///
110 /// ```
111 /// use icu::locale::LanguageIdentifier;
112 ///
113 /// LanguageIdentifier::try_from_str("en-US").expect("Parsing failed");
114 /// ```
115 #[inline]
116 #[cfg(feature = "alloc")]
117 pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
118 Self::try_from_utf8(s.as_bytes())
119 }
120
121 /// See [`Self::try_from_str`]
122 ///
123 /// ✨ *Enabled with the `alloc` Cargo feature.*
124 #[cfg(feature = "alloc")]
125 pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
126 crate::parser::parse_language_identifier(code_units, parser::ParserMode::LanguageIdentifier)
127 }
128
129 #[doc(hidden)] // macro use
130 #[expect(clippy::type_complexity)]
131 // The return type should be `Result<Self, ParseError>` once the `const_precise_live_drops`
132 // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
133 pub const fn try_from_utf8_with_single_variant(
134 code_units: &[u8],
135 ) -> Result<
136 (
137 subtags::Language,
138 Option<subtags::Script>,
139 Option<subtags::Region>,
140 Option<subtags::Variant>,
141 ),
142 ParseError,
143 > {
144 crate::parser::parse_language_identifier_with_single_variant(
145 code_units,
146 parser::ParserMode::LanguageIdentifier,
147 )
148 }
149
150 /// A constructor which takes a utf8 slice which may contain extension keys,
151 /// parses it and produces a well-formed [`LanguageIdentifier`].
152 ///
153 /// ✨ *Enabled with the `alloc` Cargo feature.*
154 ///
155 /// # Examples
156 ///
157 /// ```
158 /// use icu::locale::{langid, LanguageIdentifier};
159 ///
160 /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
161 /// .expect("Parsing failed.");
162 ///
163 /// assert_eq!(li, langid!("en-US"));
164 /// ```
165 ///
166 /// This method should be used for input that may be a locale identifier.
167 /// All extensions will be lost.
168 #[cfg(feature = "alloc")]
169 pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParseError> {
170 parser::parse_language_identifier(v, parser::ParserMode::Locale)
171 }
172
173 /// Whether this [`LanguageIdentifier`] equals [`LanguageIdentifier::UNKNOWN`].
174 pub const fn is_unknown(&self) -> bool {
175 self.language.is_unknown()
176 && self.script.is_none()
177 && self.region.is_none()
178 && self.variants.is_empty()
179 }
180
181 /// Normalize the language identifier (operating on UTF-8 formatted byte slices)
182 ///
183 /// This operation will normalize casing and the separator.
184 ///
185 /// ✨ *Enabled with the `alloc` Cargo feature.*
186 ///
187 /// # Examples
188 ///
189 /// ```
190 /// use icu::locale::LanguageIdentifier;
191 ///
192 /// assert_eq!(
193 /// LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
194 /// Ok("pl-Latn-PL")
195 /// );
196 /// ```
197 #[cfg(feature = "alloc")]
198 pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
199 let lang_id = Self::try_from_utf8(input)?;
200 Ok(writeable::to_string_or_borrow(&lang_id, input))
201 }
202
203 /// Normalize the language identifier (operating on strings)
204 ///
205 /// This operation will normalize casing and the separator.
206 ///
207 /// ✨ *Enabled with the `alloc` Cargo feature.*
208 ///
209 /// # Examples
210 ///
211 /// ```
212 /// use icu::locale::LanguageIdentifier;
213 ///
214 /// assert_eq!(
215 /// LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
216 /// Ok("pl-Latn-PL")
217 /// );
218 /// ```
219 #[cfg(feature = "alloc")]
220 pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
221 Self::normalize_utf8(input.as_bytes())
222 }
223
224 /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
225 ///
226 /// The return value is equivalent to what would happen if you first converted this
227 /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
228 ///
229 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
230 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
231 ///
232 /// # Examples
233 ///
234 /// Sorting a list of langids with this method requires converting one of them to a string:
235 ///
236 /// ```
237 /// use icu::locale::LanguageIdentifier;
238 /// use std::cmp::Ordering;
239 /// use writeable::Writeable;
240 ///
241 /// // Random input order:
242 /// let bcp47_strings: &[&str] = &[
243 /// "ar-Latn",
244 /// "zh-Hant-TW",
245 /// "zh-TW",
246 /// "und-fonipa",
247 /// "zh-Hant",
248 /// "ar-SA",
249 /// ];
250 ///
251 /// let mut langids = bcp47_strings
252 /// .iter()
253 /// .map(|s| s.parse().unwrap())
254 /// .collect::<Vec<LanguageIdentifier>>();
255 /// langids.sort_by(|a, b| {
256 /// let b = b.write_to_string();
257 /// a.strict_cmp(b.as_bytes())
258 /// });
259 /// let strict_cmp_strings = langids
260 /// .iter()
261 /// .map(|l| l.to_string())
262 /// .collect::<Vec<String>>();
263 ///
264 /// // Output ordering, sorted alphabetically
265 /// let expected_ordering: &[&str] = &[
266 /// "ar-Latn",
267 /// "ar-SA",
268 /// "und-fonipa",
269 /// "zh-Hant",
270 /// "zh-Hant-TW",
271 /// "zh-TW",
272 /// ];
273 ///
274 /// assert_eq!(expected_ordering, strict_cmp_strings);
275 /// ```
276 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
277 writeable::cmp_utf8(self, other)
278 }
279
280 pub(crate) fn as_tuple(
281 &self,
282 ) -> (
283 subtags::Language,
284 Option<subtags::Script>,
285 Option<subtags::Region>,
286 &subtags::Variants,
287 ) {
288 (self.language, self.script, self.region, &self.variants)
289 }
290
291 /// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
292 /// The result is a total ordering sufficient for use in a [`BTreeSet`].
293 ///
294 /// Unlike [`LanguageIdentifier::strict_cmp`], the ordering may or may not be equivalent
295 /// to string ordering, and it may or may not be stable across ICU4X releases.
296 ///
297 /// # Examples
298 ///
299 /// This method returns a nonsensical ordering derived from the fields of the struct:
300 ///
301 /// ```
302 /// use icu::locale::LanguageIdentifier;
303 /// use std::cmp::Ordering;
304 ///
305 /// // Input strings, sorted alphabetically
306 /// let bcp47_strings: &[&str] = &[
307 /// "ar-Latn",
308 /// "ar-SA",
309 /// "und-fonipa",
310 /// "zh-Hant",
311 /// "zh-Hant-TW",
312 /// "zh-TW",
313 /// ];
314 /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
315 ///
316 /// let mut langids = bcp47_strings
317 /// .iter()
318 /// .map(|s| s.parse().unwrap())
319 /// .collect::<Vec<LanguageIdentifier>>();
320 /// langids.sort_by(LanguageIdentifier::total_cmp);
321 /// let total_cmp_strings = langids
322 /// .iter()
323 /// .map(|l| l.to_string())
324 /// .collect::<Vec<String>>();
325 ///
326 /// // Output ordering, sorted arbitrarily
327 /// let expected_ordering: &[&str] = &[
328 /// "ar-SA",
329 /// "ar-Latn",
330 /// "und-fonipa",
331 /// "zh-TW",
332 /// "zh-Hant",
333 /// "zh-Hant-TW",
334 /// ];
335 ///
336 /// assert_eq!(expected_ordering, total_cmp_strings);
337 /// ```
338 ///
339 /// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
340 ///
341 /// ```no_run
342 /// use icu::locale::LanguageIdentifier;
343 /// use std::cmp::Ordering;
344 /// use std::collections::BTreeSet;
345 ///
346 /// #[derive(PartialEq, Eq)]
347 /// struct LanguageIdentifierTotalOrd(LanguageIdentifier);
348 ///
349 /// impl Ord for LanguageIdentifierTotalOrd {
350 /// fn cmp(&self, other: &Self) -> Ordering {
351 /// self.0.total_cmp(&other.0)
352 /// }
353 /// }
354 ///
355 /// impl PartialOrd for LanguageIdentifierTotalOrd {
356 /// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
357 /// Some(self.cmp(other))
358 /// }
359 /// }
360 ///
361 /// let _: BTreeSet<LanguageIdentifierTotalOrd> = unimplemented!();
362 /// ```
363 ///
364 /// [`BTreeSet`]: alloc::collections::BTreeSet
365 pub fn total_cmp(&self, other: &Self) -> Ordering {
366 self.as_tuple().cmp(&other.as_tuple())
367 }
368
369 /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
370 ///
371 /// The return value is equivalent to what would happen if you first parsed the
372 /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
373 ///
374 /// # Examples
375 ///
376 /// ```
377 /// use icu::locale::LanguageIdentifier;
378 ///
379 /// let bcp47_strings: &[&str] = &[
380 /// "pl-LaTn-pL",
381 /// "uNd",
382 /// "UnD-adlm",
383 /// "uNd-GB",
384 /// "UND-FONIPA",
385 /// "ZH",
386 /// ];
387 ///
388 /// for a in bcp47_strings {
389 /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
390 /// }
391 /// ```
392 pub fn normalizing_eq(&self, other: &str) -> bool {
393 macro_rules! subtag_matches {
394 ($T:ty, $iter:ident, $expected:expr) => {
395 $iter
396 .next()
397 .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
398 .unwrap_or(false)
399 };
400 }
401
402 let mut iter = parser::SubtagIterator::new(other.as_bytes());
403 if !subtag_matches!(subtags::Language, iter, self.language) {
404 return false;
405 }
406 if let Some(ref script) = self.script {
407 if !subtag_matches!(subtags::Script, iter, *script) {
408 return false;
409 }
410 }
411 if let Some(ref region) = self.region {
412 if !subtag_matches!(subtags::Region, iter, *region) {
413 return false;
414 }
415 }
416 for variant in self.variants.iter() {
417 if !subtag_matches!(subtags::Variant, iter, *variant) {
418 return false;
419 }
420 }
421 iter.next().is_none()
422 }
423
424 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
425 where
426 F: FnMut(&str) -> Result<(), E>,
427 {
428 f(self.language.as_str())?;
429 if let Some(ref script) = self.script {
430 f(script.as_str())?;
431 }
432 if let Some(ref region) = self.region {
433 f(region.as_str())?;
434 }
435 for variant in self.variants.iter() {
436 f(variant.as_str())?;
437 }
438 Ok(())
439 }
440
441 /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
442 /// lowercase ascii form.
443 ///
444 /// The default normalization of language identifiers uses titlecase scripts and uppercase
445 /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
446 ///
447 /// > _The canonical form for all subtags in the extension is lowercase, with the fields
448 /// > ordered by the separators, alphabetically._
449 ///
450 /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
451 /// normalization of the language identifier.
452 ///
453 /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
454 /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
455 /// but titlecased and uppercased outside T extensions respectively.
456 ///
457 /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
458 /// [`Transform extensions`]: crate::extensions::transform
459 pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
460 where
461 F: FnMut(&str) -> Result<(), E>,
462 {
463 f(self.language.as_str())?;
464 if let Some(ref script) = self.script {
465 f(script.to_tinystr().to_ascii_lowercase().as_str())?;
466 }
467 if let Some(ref region) = self.region {
468 f(region.to_tinystr().to_ascii_lowercase().as_str())?;
469 }
470 for variant in self.variants.iter() {
471 f(variant.as_str())?;
472 }
473 Ok(())
474 }
475
476 /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
477 /// lowercase ascii chars.
478 ///
479 /// The default normalization of language identifiers uses titlecase scripts and uppercase
480 /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
481 ///
482 /// > _The canonical form for all subtags in the extension is lowercase, with the fields
483 /// > ordered by the separators, alphabetically._
484 ///
485 /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
486 /// normalization of the language identifier.
487 ///
488 /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
489 /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
490 /// but titlecased and uppercased outside T extensions respectively.
491 ///
492 /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
493 /// [`Transform extensions`]: crate::extensions::transform
494 pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
495 &self,
496 sink: &mut W,
497 ) -> core::fmt::Result {
498 let mut initial = true;
499 self.for_each_subtag_str_lowercased(&mut |subtag| {
500 if initial {
501 initial = false;
502 } else {
503 sink.write_char('-')?;
504 }
505 sink.write_str(subtag)
506 })
507 }
508}
509
510impl core::fmt::Debug for LanguageIdentifier {
511 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
512 core::fmt::Display::fmt(&self, f)
513 }
514}
515
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// Lets `str::parse::<LanguageIdentifier>()` be used; parsing goes through
/// the same UTF-8 constructor as [`LanguageIdentifier::try_from_str`].
#[cfg(feature = "alloc")]
impl FromStr for LanguageIdentifier {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        LanguageIdentifier::try_from_utf8(s.as_bytes())
    }
}
526
527impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => Some(selff.language.as_str()));
528
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // Every tag below is well-formed, so a parse failure is a bug in the test itself.
    fn parsed(tag: &str) -> LanguageIdentifier {
        tag.parse::<LanguageIdentifier>().unwrap()
    }

    assert_writeable_eq!(LanguageIdentifier::UNKNOWN, "und");
    assert_writeable_eq!(parsed("und-001"), "und-001");
    assert_writeable_eq!(parsed("und-Mymr"), "und-Mymr");
    assert_writeable_eq!(parsed("my-Mymr-MM"), "my-Mymr-MM");
    assert_writeable_eq!(parsed("my-Mymr-MM-posix"), "my-Mymr-MM-posix");
    assert_writeable_eq!(parsed("zh-macos-posix"), "zh-macos-posix");
}
551
552/// # Examples
553///
554/// ```
555/// use icu::locale::{langid, subtags::language, LanguageIdentifier};
556///
557/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
558/// ```
559impl From<subtags::Language> for LanguageIdentifier {
560 fn from(language: subtags::Language) -> Self {
561 Self {
562 language,
563 script: None,
564 region: None,
565 variants: subtags::Variants::new(),
566 }
567 }
568}
569
570/// # Examples
571///
572/// ```
573/// use icu::locale::{langid, subtags::script, LanguageIdentifier};
574///
575/// assert_eq!(
576/// LanguageIdentifier::from(Some(script!("latn"))),
577/// langid!("und-Latn")
578/// );
579/// ```
580impl From<Option<subtags::Script>> for LanguageIdentifier {
581 fn from(script: Option<subtags::Script>) -> Self {
582 Self {
583 language: subtags::Language::UNKNOWN,
584 script,
585 region: None,
586 variants: subtags::Variants::new(),
587 }
588 }
589}
590
591/// # Examples
592///
593/// ```
594/// use icu::locale::{langid, subtags::region, LanguageIdentifier};
595///
596/// assert_eq!(
597/// LanguageIdentifier::from(Some(region!("US"))),
598/// langid!("und-US")
599/// );
600/// ```
601impl From<Option<subtags::Region>> for LanguageIdentifier {
602 fn from(region: Option<subtags::Region>) -> Self {
603 Self {
604 language: subtags::Language::UNKNOWN,
605 script: None,
606 region,
607 variants: subtags::Variants::new(),
608 }
609 }
610}
611
612/// Convert from an LSR tuple to a [`LanguageIdentifier`].
613///
614/// # Examples
615///
616/// ```
617/// use icu::locale::{
618/// langid,
619/// subtags::{language, region, script},
620/// LanguageIdentifier,
621/// };
622///
623/// let lang = language!("en");
624/// let script = script!("Latn");
625/// let region = region!("US");
626/// assert_eq!(
627/// LanguageIdentifier::from((lang, Some(script), Some(region))),
628/// langid!("en-Latn-US")
629/// );
630/// ```
631impl
632 From<(
633 subtags::Language,
634 Option<subtags::Script>,
635 Option<subtags::Region>,
636 )> for LanguageIdentifier
637{
638 fn from(
639 lsr: (
640 subtags::Language,
641 Option<subtags::Script>,
642 Option<subtags::Region>,
643 ),
644 ) -> Self {
645 Self {
646 language: lsr.0,
647 script: lsr.1,
648 region: lsr.2,
649 variants: subtags::Variants::new(),
650 }
651 }
652}
653
654/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
655///
656/// # Examples
657///
658/// ```
659/// use icu::locale::{
660/// langid,
661/// subtags::{language, region, script},
662/// };
663///
664/// let lid = langid!("en-Latn-US");
665/// let (lang, script, region) = (&lid).into();
666///
667/// assert_eq!(lang, language!("en"));
668/// assert_eq!(script, Some(script!("Latn")));
669/// assert_eq!(region, Some(region!("US")));
670/// ```
671impl From<&LanguageIdentifier>
672 for (
673 subtags::Language,
674 Option<subtags::Script>,
675 Option<subtags::Region>,
676 )
677{
678 fn from(langid: &LanguageIdentifier) -> Self {
679 (langid.language, langid.script, langid.region)
680 }
681}