icu_locid_transform/fallback/
algorithms.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use icu_locid::extensions::unicode::{key, Key};
6use icu_locid::subtags::Language;
7use icu_locid::LanguageIdentifier;
8use icu_provider::FallbackPriority;
9
10use super::*;
11
12const SUBDIVISION_KEY: Key = key!("sd");
13
14impl<'a> LocaleFallbackerWithConfig<'a> {
15    pub(crate) fn normalize(&self, locale: &mut DataLocale) {
16        let language = locale.language();
17        // 1. Populate the region (required for region fallback only)
18        if self.config.priority == FallbackPriority::Region && locale.region().is_none() {
19            // 1a. First look for region based on language+script
20            if let Some(script) = locale.script() {
21                locale.set_region(
22                    self.likely_subtags
23                        .ls2r
24                        .get_2d(
25                            &language.into_tinystr().to_unvalidated(),
26                            &script.into_tinystr().to_unvalidated(),
27                        )
28                        .copied(),
29                );
30            }
31            // 1b. If that fails, try language only
32            if locale.region().is_none() {
33                locale.set_region(
34                    self.likely_subtags
35                        .l2r
36                        .get(&language.into_tinystr().to_unvalidated())
37                        .copied(),
38                );
39            }
40        }
41        // 2. Remove the script if it is implied by the other subtags
42        if let Some(script) = locale.script() {
43            let default_script = self
44                .likely_subtags
45                .l2s
46                .get_copied(&language.into_tinystr().to_unvalidated())
47                .unwrap_or(DEFAULT_SCRIPT);
48            if let Some(region) = locale.region() {
49                if script
50                    == self
51                        .likely_subtags
52                        .lr2s
53                        .get_copied_2d(
54                            &language.into_tinystr().to_unvalidated(),
55                            &region.into_tinystr().to_unvalidated(),
56                        )
57                        .unwrap_or(default_script)
58                {
59                    locale.set_script(None);
60                }
61            } else if script == default_script {
62                locale.set_script(None);
63            }
64        }
65        // 3. Remove irrelevant extension subtags
66        locale.retain_unicode_ext(|key| {
67            match *key {
68                // Always retain -u-sd
69                SUBDIVISION_KEY => true,
70                // Retain the query-specific keyword
71                _ if Some(*key) == self.config.extension_key => true,
72                // Drop all others
73                _ => false,
74            }
75        });
76        // 4. If there is an invalid "sd" subtag, drop it
77        // For now, ignore it, and let fallback do it for us
78    }
79}
80
81impl<'a> LocaleFallbackIteratorInner<'a> {
82    pub fn step(&mut self, locale: &mut DataLocale) {
83        match self.config.priority {
84            FallbackPriority::Language => self.step_language(locale),
85            FallbackPriority::Region => self.step_region(locale),
86            // TODO(#1964): Change the collation fallback rules to be different
87            // from the language fallback fules.
88            FallbackPriority::Collation => self.step_language(locale),
89            // This case should not normally happen, but `FallbackPriority` is non_exhaustive.
90            // Make it go directly to `und`.
91            _ => {
92                debug_assert!(
93                    false,
94                    "Unknown FallbackPriority: {:?}",
95                    self.config.priority
96                );
97                *locale = Default::default()
98            }
99        }
100    }
101
102    fn step_language(&mut self, locale: &mut DataLocale) {
103        // 1. Remove the extension fallback keyword
104        if let Some(extension_key) = self.config.extension_key {
105            if let Some(value) = locale.remove_unicode_ext(&extension_key) {
106                self.backup_extension = Some(value);
107                return;
108            }
109        }
110        // 2. Remove the subdivision keyword
111        if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) {
112            self.backup_subdivision = Some(value);
113            return;
114        }
115        // 3. Assert that the locale is a language identifier
116        debug_assert!(!locale.has_unicode_ext());
117        // 4. Remove variants
118        if locale.has_variants() {
119            self.backup_variants = Some(locale.clear_variants());
120            return;
121        }
122        // 5. Check for parent override
123        if let Some(parent) = self.get_explicit_parent(locale) {
124            locale.set_langid(parent);
125            self.restore_extensions_variants(locale);
126            return;
127        }
128        // 6. Add the script subtag if necessary
129        if locale.script().is_none() {
130            if let Some(region) = locale.region() {
131                let language = locale.language();
132                if let Some(script) = self.likely_subtags.lr2s.get_copied_2d(
133                    &language.into_tinystr().to_unvalidated(),
134                    &region.into_tinystr().to_unvalidated(),
135                ) {
136                    locale.set_script(Some(script));
137                    self.restore_extensions_variants(locale);
138                    return;
139                }
140            }
141        }
142        // 7. Remove region
143        if locale.region().is_some() {
144            locale.set_region(None);
145            self.restore_extensions_variants(locale);
146            return;
147        }
148        // 8. Remove language+script
149        debug_assert!(!locale.language().is_empty()); // don't call .step() on und
150        locale.set_script(None);
151        locale.set_language(Language::UND);
152    }
153
154    fn step_region(&mut self, locale: &mut DataLocale) {
155        // 1. Remove the extension fallback keyword
156        if let Some(extension_key) = self.config.extension_key {
157            if let Some(value) = locale.remove_unicode_ext(&extension_key) {
158                self.backup_extension = Some(value);
159                return;
160            }
161        }
162        // 2. Remove the subdivision keyword
163        if let Some(value) = locale.remove_unicode_ext(&SUBDIVISION_KEY) {
164            self.backup_subdivision = Some(value);
165            return;
166        }
167        // 3. Assert that the locale is a language identifier
168        debug_assert!(!locale.has_unicode_ext());
169        // 4. Remove variants
170        if locale.has_variants() {
171            self.backup_variants = Some(locale.clear_variants());
172            return;
173        }
174        // 5. Remove language+script
175        if !locale.language().is_empty() || locale.script().is_some() {
176            locale.set_script(None);
177            locale.set_language(Language::UND);
178            self.restore_extensions_variants(locale);
179            return;
180        }
181        // 6. Remove region
182        debug_assert!(locale.region().is_some()); // don't call .step() on und
183        locale.set_region(None);
184    }
185
186    fn restore_extensions_variants(&mut self, locale: &mut DataLocale) {
187        if let Some(value) = self.backup_extension.take() {
188            #[allow(clippy::unwrap_used)] // not reachable unless extension_key is present
189            locale.set_unicode_ext(self.config.extension_key.unwrap(), value);
190        }
191        if let Some(value) = self.backup_subdivision.take() {
192            locale.set_unicode_ext(SUBDIVISION_KEY, value);
193        }
194        if let Some(variants) = self.backup_variants.take() {
195            locale.set_variants(variants);
196        }
197    }
198
199    fn get_explicit_parent(&self, locale: &DataLocale) -> Option<LanguageIdentifier> {
200        self.supplement
201            .and_then(|supplement| {
202                supplement
203                    .parents
204                    .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse())
205            })
206            .or_else(|| {
207                self.parents
208                    .parents
209                    .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse())
210            })
211            .map(LanguageIdentifier::from)
212    }
213}
214
#[cfg(test)]
mod tests {
    use super::*;
    use writeable::Writeable;

    /// Unicode extension keywords take part in fallback, but [auxiliary keys] are not modified.
    ///
    /// [auxiliary keys]: icu_provider::AuxiliaryKeys
    #[test]
    fn test_aux_key_fallback() {
        let fallbacker = LocaleFallbacker::new();
        let mut fallback_iterator = fallbacker
            .for_config(Default::default())
            .fallback_for("en-US-u-sd-usca-x-aux".parse().unwrap());

        // The first entry is the starting locale; every later entry is reached by
        // exactly one call to `step()`.
        let expected_chain = [
            "en-US-u-sd-usca-x-aux",
            "en-US-x-aux",
            "en-u-sd-usca-x-aux",
            "en-x-aux",
            "und-x-aux",
        ];
        for (index, expected) in expected_chain.iter().enumerate() {
            if index > 0 {
                fallback_iterator.step();
            }
            assert_eq!(fallback_iterator.get().to_string(), *expected);
        }
        assert!(fallback_iterator.get().is_und());
    }

    /// One row of the fallback test table.
    struct TestCase {
        /// Locale string the fallback iterator is created for.
        input: &'static str,
        /// Whether the fallbacker built with data (`LocaleFallbacker::new`) is used
        /// instead of the one built without data.
        requires_data: bool,
        /// Value assigned to the config's `extension_key`.
        extension_key: Option<Key>,
        /// Value assigned to the config's `fallback_supplement`.
        fallback_supplement: Option<LocaleFallbackSupplement>,
        // Note: The first entry in the chain is the normalized locale
        /// Locales expected, in order, under language priority (before `und`).
        expected_language_chain: &'static [&'static str],
        /// Locales expected, in order, under region priority (before `und`).
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-u-sd-usca", "en", "und-u-sd-usca"],
        },
        TestCase {
            input: "en-US-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-hc-h12-sd-usca",
            requires_data: false,
            extension_key: Some(key!("hc")),
            fallback_supplement: None,
            expected_language_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa-u-hc-h12-sd-usca",
                "en-fonipa-u-sd-usca",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-hc-h12-sd-usca",
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-hc-h12-sd-usca",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-u-hc-h12-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en-u-sd-usca", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            // TODO(#4413): -u-rg is not yet supported; when it is, this test should be updated
            input: "en-u-rg-gbxxxx",
            requires_data: false,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["en"],
            expected_region_chain: &["en"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-ME", "sr-Latn-ME", "sr-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-ME-fonipa",
                "sr-Latn-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-RS", "sr"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["de-LI", "de"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-IN", "hi"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["zh-TW", "zh-Hant-TW", "zh-Hant"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: None,
            expected_language_chain: &["yue-HK", "yue"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            extension_key: None,
            fallback_supplement: Some(LocaleFallbackSupplement::Collation),
            expected_language_chain: &["yue-HK", "yue", "zh-Hant", "zh"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
    ];

    /// Runs every row of `TEST_CASES` under both language and region priority,
    /// checking each locale in the expected chain and that the chain ends at `und`.
    #[test]
    fn test_fallback() {
        let fallbacker_no_data = LocaleFallbacker::new_without_data();
        let fallbacker_no_data = fallbacker_no_data.as_borrowed();
        let fallbacker_with_data = LocaleFallbacker::new();

        for case in TEST_CASES {
            let chains_by_priority = [
                (
                    LocaleFallbackPriority::Language,
                    case.expected_language_chain,
                ),
                (LocaleFallbackPriority::Region, case.expected_region_chain),
            ];
            for (priority, expected_chain) in chains_by_priority {
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                config.extension_key = case.extension_key;
                config.fallback_supplement = case.fallback_supplement;

                let fallbacker = if case.requires_data {
                    fallbacker_with_data
                } else {
                    fallbacker_no_data
                };
                let mut iterator = fallbacker
                    .for_config(config)
                    .fallback_for(case.input.parse().unwrap());

                for expected in expected_chain.iter().copied() {
                    assert_eq!(
                        expected,
                        &*iterator.get().write_to_string(),
                        "{:?} ({:?})",
                        case.input,
                        priority
                    );
                    iterator.step();
                }
                // After the whole expected chain has been consumed, only `und` remains.
                assert_eq!(
                    "und",
                    &*iterator.get().write_to_string(),
                    "{:?} ({:?})",
                    case.input,
                    priority
                );
            }
        }
    }
}