1use crate::provider::*;
8use crate::LocaleTransformError;
9use alloc::vec::Vec;
10use core::cmp::Ordering;
11
12use crate::LocaleExpander;
13use crate::TransformResult;
14use icu_locid::extensions::Extensions;
15use icu_locid::subtags::{Language, Region, Script};
16use icu_locid::{
17 extensions::unicode::key,
18 subtags::{language, Variant, Variants},
19 LanguageIdentifier, Locale,
20};
21use icu_provider::prelude::*;
22use tinystr::TinyAsciiStr;
23
/// Canonicalizes locale identifiers by applying locale alias data.
///
/// The entry point is [`LocaleCanonicalizer::canonicalize`], which rewrites a
/// [`Locale`] in place until none of the alias rules applies any more.
#[derive(Debug)]
pub struct LocaleCanonicalizer {
    /// Alias tables (language, script, region, variant and subdivision aliases)
    /// consulted by every canonicalization step.
    aliases: DataPayload<AliasesV2Marker>,
    /// Expander whose `maximize` is used to choose among several candidate
    /// regions when a region alias maps to more than one replacement.
    expander: LocaleExpander,
}
47
48fn uts35_rule_matches<'a, I>(
49 source: &LanguageIdentifier,
50 language: Language,
51 script: Option<Script>,
52 region: Option<Region>,
53 raw_variants: I,
54) -> bool
55where
56 I: Iterator<Item = &'a str>,
57{
58 (language.is_empty() || language == source.language)
59 && (script.is_none() || script == source.script)
60 && (region.is_none() || region == source.region)
61 && {
62 let mut source_variants = source.variants.iter();
65 'outer: for raw_variant in raw_variants {
66 for source_variant in source_variants.by_ref() {
67 match source_variant.strict_cmp(raw_variant.as_bytes()) {
68 Ordering::Equal => {
69 continue 'outer;
71 }
72 Ordering::Less => {
73 }
75 Ordering::Greater => {
76 return false;
79 }
80 }
81 }
82 return false;
84 }
85 true
86 }
87}
88
89fn uts35_replacement<'a, I>(
90 source: &mut LanguageIdentifier,
91 ruletype_has_language: bool,
92 ruletype_has_script: bool,
93 ruletype_has_region: bool,
94 ruletype_variants: Option<I>,
95 replacement: &LanguageIdentifier,
96) where
97 I: Iterator<Item = &'a str>,
98{
99 if ruletype_has_language || (source.language.is_empty() && !replacement.language.is_empty()) {
100 source.language = replacement.language;
101 }
102 if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
103 source.script = replacement.script;
104 }
105 if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
106 source.region = replacement.region;
107 }
108 if let Some(skips) = ruletype_variants {
109 let mut sources = source.variants.iter().peekable();
117 let mut replacements = replacement.variants.iter().peekable();
118 let mut skips = skips.peekable();
119
120 let mut variants: Vec<Variant> = Vec::new();
121
122 loop {
123 match (sources.peek(), skips.peek(), replacements.peek()) {
124 (Some(&source), Some(skip), _)
125 if source.strict_cmp(skip.as_bytes()) == Ordering::Greater =>
126 {
127 skips.next();
128 }
129 (Some(&source), Some(skip), _)
130 if source.strict_cmp(skip.as_bytes()) == Ordering::Equal =>
131 {
132 skips.next();
133 sources.next();
134 }
135 (Some(&source), _, Some(&replacement))
136 if replacement.cmp(source) == Ordering::Less =>
137 {
138 variants.push(*replacement);
139 replacements.next();
140 }
141 (Some(&source), _, Some(&replacement))
142 if replacement.cmp(source) == Ordering::Equal =>
143 {
144 variants.push(*source);
145 sources.next();
146 replacements.next();
147 }
148 (Some(&source), _, _) => {
149 variants.push(*source);
150 sources.next();
151 }
152 (None, _, Some(&replacement)) => {
153 variants.push(*replacement);
154 replacements.next();
155 }
156 (None, _, None) => {
157 break;
158 }
159 }
160 }
161 source.variants = Variants::from_vec_unchecked(variants);
162 }
163}
164
165#[inline]
166fn uts35_check_language_rules(
167 langid: &mut LanguageIdentifier,
168 alias_data: &DataPayload<AliasesV2Marker>,
169) -> TransformResult {
170 if !langid.language.is_empty() {
171 let lang: TinyAsciiStr<3> = langid.language.into();
172 let replacement = if lang.len() == 2 {
173 alias_data
174 .get()
175 .language_len2
176 .get(&lang.resize().to_unvalidated())
177 } else {
178 alias_data.get().language_len3.get(&lang.to_unvalidated())
179 };
180
181 if let Some(replacement) = replacement {
182 if let Ok(new_langid) = replacement.parse() {
183 uts35_replacement::<core::iter::Empty<&str>>(
184 langid,
185 true,
186 false,
187 false,
188 None,
189 &new_langid,
190 );
191 return TransformResult::Modified;
192 }
193 }
194 }
195
196 TransformResult::Unmodified
197}
198
#[cfg(feature = "compiled_data")]
impl Default for LocaleCanonicalizer {
    /// Equivalent to [`LocaleCanonicalizer::new`].
    fn default() -> Self {
        LocaleCanonicalizer::new()
    }
}
205
206impl LocaleCanonicalizer {
    /// Creates a [`LocaleCanonicalizer`] by passing the expander returned by
    /// [`LocaleExpander::new_extended`] to [`Self::new_with_expander`].
    ///
    /// Only compiled when the `compiled_data` feature is enabled.
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self::new_with_expander(LocaleExpander::new_extended())
    }
216
217 #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new)]
219 pub fn try_new_with_any_provider(
220 provider: &(impl AnyProvider + ?Sized),
221 ) -> Result<Self, LocaleTransformError> {
222 let expander = LocaleExpander::try_new_with_any_provider(provider)?;
223 Self::try_new_with_expander_compat(&provider.as_downcasting(), expander)
224 }
225
226 #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER, Self::new)]
228 #[cfg(feature = "serde")]
229 pub fn try_new_with_buffer_provider(
230 provider: &(impl BufferProvider + ?Sized),
231 ) -> Result<Self, LocaleTransformError> {
232 let expander = LocaleExpander::try_new_with_buffer_provider(provider)?;
233 Self::try_new_with_expander_compat(&provider.as_deserializing(), expander)
234 }
235
236 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
237 pub fn try_new_unstable<P>(provider: &P) -> Result<Self, LocaleTransformError>
238 where
239 P: DataProvider<AliasesV2Marker>
240 + DataProvider<LikelySubtagsForLanguageV1Marker>
241 + DataProvider<LikelySubtagsForScriptRegionV1Marker>
242 + ?Sized,
243 {
244 let expander = LocaleExpander::try_new_unstable(provider)?;
245 Self::try_new_with_expander_unstable(provider, expander)
246 }
247
    /// Creates a [`LocaleCanonicalizer`] that combines the baked alias data
    /// with the caller-supplied `expander`.
    ///
    /// The alias payload borrows the baked
    /// `SINGLETON_LOCID_TRANSFORM_ALIASES_V2` static through
    /// `DataPayload::from_static_ref`.
    ///
    /// Only compiled when the `compiled_data` feature is enabled.
    #[cfg(feature = "compiled_data")]
    pub const fn new_with_expander(expander: LocaleExpander) -> Self {
        Self {
            aliases: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V2,
            ),
            expander,
        }
    }
262
263 fn try_new_with_expander_compat<P>(
264 provider: &P,
265 expander: LocaleExpander,
266 ) -> Result<Self, LocaleTransformError>
267 where
268 P: DataProvider<AliasesV2Marker> + DataProvider<AliasesV1Marker> + ?Sized,
269 {
270 let payload_v2: Result<DataPayload<AliasesV2Marker>, _> = provider
271 .load(Default::default())
272 .and_then(DataResponse::take_payload);
273 let aliases = if let Ok(payload) = payload_v2 {
274 payload
275 } else {
276 let payload_v1: DataPayload<AliasesV1Marker> = provider
277 .load(Default::default())
278 .and_then(DataResponse::take_payload)?;
279 payload_v1.try_map_project(|st, _| st.try_into())?
280 };
281
282 Ok(Self { aliases, expander })
283 }
284
285 #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
286 pub fn try_new_with_expander_unstable<P>(
287 provider: &P,
288 expander: LocaleExpander,
289 ) -> Result<Self, LocaleTransformError>
290 where
291 P: DataProvider<AliasesV2Marker> + ?Sized,
292 {
293 let aliases: DataPayload<AliasesV2Marker> =
294 provider.load(Default::default())?.take_payload()?;
295
296 Ok(Self { aliases, expander })
297 }
298
299 #[doc = icu_provider::gen_any_buffer_unstable_docs!(ANY, Self::new_with_expander)]
300 pub fn try_new_with_expander_with_any_provider(
301 provider: &(impl AnyProvider + ?Sized),
302 options: LocaleExpander,
303 ) -> Result<Self, LocaleTransformError> {
304 Self::try_new_with_expander_compat(&provider.as_downcasting(), options)
305 }
306
307 #[cfg(feature = "serde")]
308 #[doc = icu_provider::gen_any_buffer_unstable_docs!(BUFFER,Self::new_with_expander)]
309 pub fn try_new_with_expander_with_buffer_provider(
310 provider: &(impl BufferProvider + ?Sized),
311 options: LocaleExpander,
312 ) -> Result<Self, LocaleTransformError> {
313 Self::try_new_with_expander_compat(&provider.as_deserializing(), options)
314 }
315
316 pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
339 let mut result = TransformResult::Unmodified;
340
341 loop {
344 let modified = if locale.id.variants.is_empty() {
350 self.canonicalize_absolute_language_fallbacks(&mut locale.id)
351 } else {
352 self.canonicalize_language_variant_fallbacks(&mut locale.id)
353 };
354 if modified {
355 result = TransformResult::Modified;
356 continue;
357 }
358
359 if !locale.id.language.is_empty() {
360 if let Some(region) = locale.id.region {
362 if locale.id.language == language!("sgn") {
363 if let Some(&sgn_lang) = self
364 .aliases
365 .get()
366 .sgn_region
367 .get(®ion.into_tinystr().to_unvalidated())
368 {
369 uts35_replacement::<core::iter::Empty<&str>>(
370 &mut locale.id,
371 true,
372 false,
373 true,
374 None,
375 &sgn_lang.into(),
376 );
377 result = TransformResult::Modified;
378 continue;
379 }
380 }
381 }
382
383 if uts35_check_language_rules(&mut locale.id, &self.aliases)
384 == TransformResult::Modified
385 {
386 result = TransformResult::Modified;
387 continue;
388 }
389 }
390
391 if let Some(script) = locale.id.script {
392 if let Some(&replacement) = self
393 .aliases
394 .get()
395 .script
396 .get(&script.into_tinystr().to_unvalidated())
397 {
398 locale.id.script = Some(replacement);
399 result = TransformResult::Modified;
400 continue;
401 }
402 }
403
404 if let Some(region) = locale.id.region {
405 let replacement = if region.is_alphabetic() {
406 self.aliases
407 .get()
408 .region_alpha
409 .get(®ion.into_tinystr().resize().to_unvalidated())
410 } else {
411 self.aliases
412 .get()
413 .region_num
414 .get(®ion.into_tinystr().to_unvalidated())
415 };
416 if let Some(&replacement) = replacement {
417 locale.id.region = Some(replacement);
418 result = TransformResult::Modified;
419 continue;
420 }
421
422 if let Some(regions) = self
423 .aliases
424 .get()
425 .complex_region
426 .get(®ion.into_tinystr().to_unvalidated())
427 {
428 if let Some(default_region) = regions.get(0) {
430 let mut maximized = LanguageIdentifier {
431 language: locale.id.language,
432 script: locale.id.script,
433 region: None,
434 variants: Variants::default(),
435 };
436
437 locale.id.region = Some(
438 match (self.expander.maximize(&mut maximized), maximized.region) {
439 (TransformResult::Modified, Some(candidate))
440 if regions.iter().any(|x| x == candidate) =>
441 {
442 candidate
443 }
444 _ => default_region,
445 },
446 );
447 result = TransformResult::Modified;
448 continue;
449 }
450 }
451 }
452
453 if !locale.id.variants.is_empty() {
454 let mut modified = Vec::with_capacity(0);
455 for (idx, &variant) in locale.id.variants.iter().enumerate() {
456 if let Some(&updated) = self
457 .aliases
458 .get()
459 .variant
460 .get(&variant.into_tinystr().to_unvalidated())
461 {
462 if modified.is_empty() {
463 modified = locale.id.variants.to_vec();
464 }
465 #[allow(clippy::indexing_slicing)]
466 let _ = core::mem::replace(&mut modified[idx], updated);
467 }
468 }
469
470 if !modified.is_empty() {
471 modified.sort();
472 modified.dedup();
473 locale.id.variants = Variants::from_vec_unchecked(modified);
474 result = TransformResult::Modified;
475 continue;
476 }
477 }
478
479 break;
481 }
482
483 if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
484 self.canonicalize_extensions(&mut locale.extensions, &mut result);
485 }
486 result
487 }
488
489 fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
490 if let Some(ref mut lang) = extensions.transform.lang {
493 while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
494 *result = TransformResult::Modified;
495 }
496 }
497
498 if !extensions.unicode.keywords.is_empty() {
499 for key in [key!("rg"), key!("sd")] {
500 if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
501 if let &[only_value] = value.as_tinystr_slice() {
502 if let Some(modified_value) = self
503 .aliases
504 .get()
505 .subdivision
506 .get(&only_value.resize().to_unvalidated())
507 {
508 if let Ok(modified_value) = modified_value.parse() {
509 *value = modified_value;
510 *result = TransformResult::Modified;
511 }
512 }
513 }
514 }
515 }
516 }
517 }
518
519 fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
520 for LanguageStrStrPair(lang, raw_variants, raw_to) in self
522 .aliases
523 .get()
524 .language_variants
525 .iter()
526 .map(zerofrom::ZeroFrom::zero_from)
527 {
528 let raw_variants = raw_variants.split('-');
529 if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
531 if let Ok(to) = raw_to.parse() {
532 uts35_replacement(lid, !lang.is_empty(), false, false, Some(raw_variants), &to);
533 return true;
534 }
535 }
536 }
537 false
538 }
539
540 fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
541 for StrStrPair(raw_from, raw_to) in self
542 .aliases
543 .get()
544 .language
545 .iter()
546 .map(zerofrom::ZeroFrom::zero_from)
547 {
548 if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
549 if uts35_rule_matches(
550 lid,
551 from.language,
552 from.script,
553 from.region,
554 from.variants.iter().map(Variant::as_str),
555 ) {
556 if let Ok(to) = raw_to.parse() {
557 uts35_replacement(
558 lid,
559 !from.language.is_empty(),
560 from.script.is_some(),
561 from.region.is_some(),
562 Some(from.variants.iter().map(Variant::as_str)),
563 &to,
564 );
565 return true;
566 }
567 }
568 }
569 }
570 false
571 }
572}
573
#[cfg(test)]
mod test {
    use super::*;

    /// Checks `uts35_rule_matches` against (source, rule, expected) triples,
    /// covering an unconstrained rule, variant subsets and a rule asking for a
    /// variant that the source lacks.
    #[test]
    fn test_uts35_rule_matches() {
        let cases = [
            ("ja", "und", true),
            ("und-heploc-hepburn", "und-hepburn", true),
            ("ja-heploc-hepburn", "und-hepburn", true),
            ("ja-hepburn", "und-hepburn-heploc", false),
        ];

        for (source, rule, expected) in cases {
            let source = source.parse().unwrap();
            let rule = rule.parse::<LanguageIdentifier>().unwrap();
            let matched = uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            );
            assert_eq!(matched, expected, "{}", source);
        }
    }

    /// Checks `uts35_replacement` against (locale, rule, replacement, expected)
    /// tuples: a variant rewrite, a region rewrite and a language + region
    /// rule whose replacement is a bare language.
    #[test]
    fn test_uts35_replacement() {
        let cases = [
            (
                "ja-Latn-fonipa-hepburn-heploc",
                "und-hepburn-heploc",
                "und-alalc97",
                "ja-Latn-alalc97-fonipa",
            ),
            ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
            ("sgn-DE", "sgn-DE", "gsg", "gsg"),
        ];

        for (input, rule_from, rule_to, expected) in cases {
            let mut locale: Locale = input.parse().unwrap();
            let rule_from = rule_from.parse::<LanguageIdentifier>().unwrap();
            let rule_to = rule_to.parse().unwrap();
            let expected = expected.parse::<Locale>().unwrap();

            uts35_replacement(
                &mut locale.id,
                !rule_from.language.is_empty(),
                rule_from.script.is_some(),
                rule_from.region.is_some(),
                Some(rule_from.variants.iter().map(Variant::as_str)),
                &rule_to,
            );

            assert_eq!(expected, locale);
        }
    }
}
631
#[cfg(feature = "serde")]
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::locale;

    /// Test provider that answers from baked data (plus a hand-built V1 alias
    /// payload) but rejects every key listed in `keys`.
    struct RejectByKeyProvider {
        keys: Vec<DataKey>,
    }

    impl AnyProvider for RejectByKeyProvider {
        fn load_any(&self, key: DataKey, _: DataRequest) -> Result<AnyResponse, DataError> {
            use alloc::borrow::Cow;

            // Prints every requested key.
            println!("{:#?}", key);
            if self.keys.contains(&key) {
                return Err(DataErrorKind::MissingDataKey.with_str_context("rejected"));
            }

            let aliases_v2 = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_ALIASES_V2;
            let l = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_L_V1;
            let ext = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_EXT_V1;
            let sr = crate::provider::Baked::SINGLETON_LOCID_TRANSFORM_LIKELYSUBTAGS_SR_V1;

            let payload = if key.hashed() == AliasesV1Marker::KEY.hashed() {
                // Minimal V1 data: only the single rule that the tests exercise.
                let aliases_v1 = AliasesV1 {
                    language_variants: zerovec::VarZeroVec::from(&[StrStrPair(
                        Cow::Borrowed("aa-saaho"),
                        Cow::Borrowed("ssy"),
                    )]),
                    ..Default::default()
                };
                DataPayload::<AliasesV1Marker>::from_owned(aliases_v1).wrap_into_any_payload()
            } else if key.hashed() == AliasesV2Marker::KEY.hashed() {
                DataPayload::<AliasesV2Marker>::from_static_ref(aliases_v2).wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsForLanguageV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsForLanguageV1Marker>::from_static_ref(l)
                    .wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsExtendedV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsExtendedV1Marker>::from_static_ref(ext)
                    .wrap_into_any_payload()
            } else if key.hashed() == LikelySubtagsForScriptRegionV1Marker::KEY.hashed() {
                DataPayload::<LikelySubtagsForScriptRegionV1Marker>::from_static_ref(sr)
                    .wrap_into_any_payload()
            } else {
                return Err(DataErrorKind::MissingDataKey.into_error());
            };

            Ok(AnyResponse {
                payload: Some(payload),
                metadata: Default::default(),
            })
        }
    }

    /// With the V2 alias key rejected, construction must fall back to the V1
    /// key and still canonicalize `aa-saaho` to `ssy`.
    #[test]
    fn test_old_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV2Marker::KEY],
        };
        let lc = LocaleCanonicalizer::try_new_with_any_provider(&provider)
            .expect("should create with old keys");
        let mut locale = locale!("aa-saaho");
        assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
        assert_eq!(locale, locale!("ssy"));
    }

    /// With the V1 alias key rejected, construction must succeed from the V2
    /// key alone and canonicalize `aa-saaho` to `ssy`.
    #[test]
    fn test_new_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV1Marker::KEY],
        };
        let lc = LocaleCanonicalizer::try_new_with_any_provider(&provider)
            .expect("should create with new keys");
        let mut locale = locale!("aa-saaho");
        assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
        assert_eq!(locale, locale!("ssy"));
    }

    /// With both alias keys rejected there is no alias data at all, so
    /// construction must fail.
    #[test]
    fn test_no_keys() {
        let provider = RejectByKeyProvider {
            keys: vec![AliasesV1Marker::KEY, AliasesV2Marker::KEY],
        };
        assert!(
            LocaleCanonicalizer::try_new_with_any_provider(&provider).is_err(),
            "should not create: no data present"
        );
    }
}