// icu_locid_transform/provider/canonicalizer.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::*;
use icu_locid::subtags::{Language, Region, Script, Variant};
use icu_provider::prelude::*;
use tinystr::UnvalidatedTinyAsciiStr;
use zerovec::{VarZeroVec, ZeroMap, ZeroSlice};

11#[icu_provider::data_struct(marker(AliasesV1Marker, "locid_transform/aliases@1", singleton))]
12#[derive(PartialEq, Clone, Default)]
13#[cfg_attr(
14    feature = "datagen",
15    derive(serde::Serialize, databake::Bake),
16    databake(path = icu_locid_transform::provider),
17)]
18#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
19#[yoke(prove_covariance_manually)]
20/// This alias data is used for locale canonicalization. Each field defines a
21/// mapping from an old identifier to a new identifier, based upon the rules in
22/// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
23/// is stored in sorted order, allowing for binary search to identify rules to
24/// apply. It is broken down into smaller vectors based upon some characteristic
25/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
26/// field contains aliases for sign language and region, so that it is not
27/// necessary to search the data unless the input is a sign language.
28///
29/// The algorithm in tr35 is not guaranteed to terminate on data other than what
30/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
31/// or modify aliases for use in this structure.
32///
33/// <div class="stab unstable">
34/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
35/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
36/// to be stable, their Rust representation might not be. Use with caution.
37/// </div>
38// TODO: Use validated types as value types
39#[derive(Debug)]
40pub struct AliasesV1<'data> {
41    /// `[language(-variant)+\] -> [langid]`
42    /// This is not a map as it's searched linearly according to the canonicalization rules.
43    #[cfg_attr(feature = "serde", serde(borrow))]
44    pub language_variants: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
45    /// `sgn-[region] -> [language]`
46    #[cfg_attr(feature = "serde", serde(borrow))]
47    pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
48    /// `[language{2}] -> [langid]`
49    #[cfg_attr(feature = "serde", serde(borrow))]
50    pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
51    /// `[language{3}] -> [langid]`
52    #[cfg_attr(feature = "serde", serde(borrow))]
53    pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
54    /// `[langid] -> [langid]`
55    /// This is not a map as it's searched linearly according to the canonicalization rules.
56    #[cfg_attr(feature = "serde", serde(borrow))]
57    pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
58
59    /// `[script] -> [script]`
60    #[cfg_attr(feature = "serde", serde(borrow))]
61    pub script: ZeroMap<'data, UnvalidatedScript, Script>,
62
63    /// `[region{2}] -> [region]`
64    #[cfg_attr(feature = "serde", serde(borrow))]
65    pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
66    /// `[region{3}] -> [region]`
67    #[cfg_attr(feature = "serde", serde(borrow))]
68    pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
69
70    /// `[region] -> [region]+`
71    #[cfg_attr(feature = "serde", serde(borrow))]
72    pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
73
74    /// `[variant] -> [variant]`
75    #[cfg_attr(feature = "serde", serde(borrow))]
76    pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
77
78    /// `[value{7}] -> [value{7}]`
79    #[cfg_attr(feature = "serde", serde(borrow))]
80    pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
81}
82
#[cfg(feature = "datagen")]
impl<'data> From<AliasesV2<'data>> for AliasesV1<'data> {
    /// Converts V2 alias data into the V1 layout.
    ///
    /// Only `language_variants` is re-encoded: each V2 entry stores the language
    /// and the variant string separately, while V1 stores them as one
    /// `language-variants` string. Every other field is moved over unchanged.
    fn from(value: AliasesV2<'data>) -> Self {
        let language_variants = value
            .language_variants
            .iter()
            .map(zerofrom::ZeroFrom::zero_from)
            .map(|v: LanguageStrStrPair| {
                // Re-join the split key into the single-string form used by V1.
                let langid = alloc::format!("{0}-{1}", v.0, v.1);
                StrStrPair(langid.into(), v.2)
            })
            .collect::<alloc::vec::Vec<StrStrPair>>();

        Self {
            language_variants: VarZeroVec::from(&language_variants),
            sgn_region: value.sgn_region,
            language_len2: value.language_len2,
            language_len3: value.language_len3,
            language: value.language,
            script: value.script,
            region_alpha: value.region_alpha,
            region_num: value.region_num,
            complex_region: value.complex_region,
            variant: value.variant,
            subdivision: value.subdivision,
        }
    }
}

112impl<'data> TryFrom<AliasesV1<'data>> for AliasesV2<'data> {
113    type Error = icu_provider::DataError;
114
115    fn try_from(value: AliasesV1<'data>) -> Result<Self, Self::Error> {
116        #[allow(unused_imports)]
117        use alloc::borrow::ToOwned;
118
119        let language_variants = value
120            .language_variants
121            .iter()
122            .map(zerofrom::ZeroFrom::zero_from)
123            .map(|v: StrStrPair| -> Result<LanguageStrStrPair, DataError> {
124                let (lang, variant) =
125                    v.0.split_once('-')
126                        .ok_or_else(|| DataError::custom("Each pair should be language-variant"))?;
127                let lang: Language = lang
128                    .parse()
129                    .map_err(|_| DataError::custom("Language should be a valid language subtag"))?;
130                Ok(LanguageStrStrPair(lang, variant.to_owned().into(), v.1))
131            })
132            .collect::<Result<alloc::vec::Vec<_>, _>>()?;
133
134        Ok(Self {
135            language_variants: VarZeroVec::from(&language_variants),
136            sgn_region: value.sgn_region,
137            language_len2: value.language_len2,
138            language_len3: value.language_len3,
139            language: value.language,
140            script: value.script,
141            region_alpha: value.region_alpha,
142            region_num: value.region_num,
143            complex_region: value.complex_region,
144            variant: value.variant,
145            subdivision: value.subdivision,
146        })
147    }
148}
149
150#[icu_provider::data_struct(marker(AliasesV2Marker, "locid_transform/aliases@2", singleton))]
151#[derive(PartialEq, Clone, Default)]
152#[cfg_attr(
153    feature = "datagen",
154    derive(serde::Serialize, databake::Bake),
155    databake(path = icu_locid_transform::provider),
156)]
157#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
158#[yoke(prove_covariance_manually)]
159/// This alias data is used for locale canonicalization. Each field defines a
160/// mapping from an old identifier to a new identifier, based upon the rules in
161/// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
162/// is stored in sorted order, allowing for binary search to identify rules to
163/// apply. It is broken down into smaller vectors based upon some characteristic
164/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
165/// field contains aliases for sign language and region, so that it is not
166/// necessary to search the data unless the input is a sign language.
167///
168/// The algorithm in tr35 is not guaranteed to terminate on data other than what
169/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
170/// or modify aliases for use in this structure.
171///
172/// <div class="stab unstable">
173/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
174/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
175/// to be stable, their Rust representation might not be. Use with caution.
176/// </div>
177// TODO: Use validated types as value types
178// Notice: V2 improves the alignment of `language_variants` speeding up canonicalization by upon
179// to 40%. See https://github.com/unicode-org/icu4x/pull/2935 for details.
180#[derive(Debug)]
181pub struct AliasesV2<'data> {
182    /// `[language, variant(-variant)*] -> [langid]`
183    /// This is not a map as it's searched linearly according to the canonicalization rules.
184    #[cfg_attr(feature = "serde", serde(borrow))]
185    pub language_variants: VarZeroVec<'data, UnvalidatedLanguageVariantsPair>,
186    /// `sgn-[region] -> [language]`
187    #[cfg_attr(feature = "serde", serde(borrow))]
188    pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
189    /// `[language{2}] -> [langid]`
190    #[cfg_attr(feature = "serde", serde(borrow))]
191    pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
192    /// `[language{3}] -> [langid]`
193    #[cfg_attr(feature = "serde", serde(borrow))]
194    pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
195    /// `[langid] -> [langid]`
196    /// This is not a map as it's searched linearly according to the canonicalization rules.
197    #[cfg_attr(feature = "serde", serde(borrow))]
198    pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
199
200    /// `[script] -> [script]`
201    #[cfg_attr(feature = "serde", serde(borrow))]
202    pub script: ZeroMap<'data, UnvalidatedScript, Script>,
203
204    /// `[region{2}] -> [region]`
205    #[cfg_attr(feature = "serde", serde(borrow))]
206    pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
207    /// `[region{3}] -> [region]`
208    #[cfg_attr(feature = "serde", serde(borrow))]
209    pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
210
211    /// `[region] -> [region]+`
212    #[cfg_attr(feature = "serde", serde(borrow))]
213    pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
214
215    /// `[variant] -> [variant]`
216    #[cfg_attr(feature = "serde", serde(borrow))]
217    pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
218
219    /// `[value{7}] -> [value{7}]`
220    #[cfg_attr(feature = "serde", serde(borrow))]
221    pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
222}