icu_properties/provider/
bidi_data.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14//!
15//! This module provides an efficient storage of data serving the following
16//! properties:
17//! - `Bidi_Paired_Bracket`
18//! - `Bidi_Paired_Bracket_Type`
19//! - `Bidi_Mirrored`
20//! - `Bidi_Mirroring_Glyph`
21
22use displaydoc::Display;
23use icu_collections::codepointtrie::{CodePointTrie, TrieValue};
24use icu_provider::prelude::*;
25use zerovec::ule::{AsULE, CharULE, ULE};
26use zerovec::ZeroVecError;
27
/// A data provider struct for properties related to Bidi algorithms, including
/// mirroring and bracket pairing.
///
/// All of the supported properties are stored together in a single `CodePointTrie`
/// whose per-code-point value is a `MirroredPairedBracketData`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
    BidiAuxiliaryPropertiesV1Marker,
    "props/bidiauxiliaryprops@1",
    singleton
))]
#[derive(Debug, Eq, PartialEq, Clone)]
#[cfg_attr(
    feature = "datagen", 
    derive(serde::Serialize, databake::Bake),
    databake(path = icu_properties::provider::bidi_data),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct BidiAuxiliaryPropertiesV1<'data> {
    /// A `CodePointTrie` efficiently storing the data from which property values
    /// can be extracted or derived for the supported Bidi properties.
    ///
    /// Each trie value bundles the mirroring glyph, the mirrored flag and the
    /// paired bracket type of a code point.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, MirroredPairedBracketData>,
}
54
55impl<'data> BidiAuxiliaryPropertiesV1<'data> {
56    #[doc(hidden)]
57    pub fn new(
58        trie: CodePointTrie<'data, MirroredPairedBracketData>,
59    ) -> BidiAuxiliaryPropertiesV1<'data> {
60        BidiAuxiliaryPropertiesV1 { trie }
61    }
62}
63
/// The per-code-point value stored in the trie of `BidiAuxiliaryPropertiesV1`.
///
/// It bundles the values of `Bidi_Mirroring_Glyph`, `Bidi_Mirrored` and
/// `Bidi_Paired_Bracket_Type` for one code point. The three fields are packed into
/// 24 bits when converted to a `u32` or to the ULE form.
// NOTE: the field order below determines the derived `Ord`/`PartialOrd` ordering.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
#[doc(hidden)] // needed for datagen but not intended for users
pub struct MirroredPairedBracketData {
    /// The `Bidi_Mirroring_Glyph` value (packed into bits 0..=20).
    /// The `Default` impl uses U+0000 here.
    pub mirroring_glyph: char,
    /// The `Bidi_Mirrored` value (packed into bit 21).
    pub mirrored: bool,
    /// The `Bidi_Paired_Bracket_Type` value (packed into bits 22..=23).
    pub paired_bracket_type: CheckedBidiPairedBracketType,
}
74
75impl Default for MirroredPairedBracketData {
76    fn default() -> Self {
77        Self {
78            mirroring_glyph: 0 as char,
79            mirrored: false,
80            paired_bracket_type: CheckedBidiPairedBracketType::None,
81        }
82    }
83}
84
85impl From<MirroredPairedBracketData> for u32 {
86    fn from(mpbd: MirroredPairedBracketData) -> u32 {
87        let mut result = mpbd.mirroring_glyph as u32;
88        result |= (mpbd.mirrored as u32) << 21;
89        result |= (mpbd.paired_bracket_type as u32) << 22;
90        result
91    }
92}
93
/// Error returned when a `u32` cannot be decoded into a `MirroredPairedBracketData`.
///
/// The serialized value did not encode a valid Bidi_Mirroring_Glyph code point
/// and/or a valid Bidi_Paired_Bracket_Type discriminant. The wrapped `u32` is the
/// rejected input; it is included in the `Display` message.
#[derive(Display, Debug, Clone, Copy, PartialEq, Eq)]
#[displaydoc("Invalid MirroredPairedBracketData serialized in int: {0}")]
pub struct MirroredPairedBracketDataTryFromError(u32);
98
99impl TryFrom<u32> for MirroredPairedBracketData {
100    type Error = MirroredPairedBracketDataTryFromError;
101
102    fn try_from(i: u32) -> Result<Self, MirroredPairedBracketDataTryFromError> {
103        let code_point = i & 0x1FFFFF;
104        let mirroring_glyph =
105            char::try_from_u32(code_point).map_err(|_| MirroredPairedBracketDataTryFromError(i))?;
106        let mirrored = ((i >> 21) & 0x1) == 1;
107        let paired_bracket_type = {
108            let value = ((i >> 22) & 0x3) as u8;
109            match value {
110                0 => CheckedBidiPairedBracketType::None,
111                1 => CheckedBidiPairedBracketType::Open,
112                2 => CheckedBidiPairedBracketType::Close,
113                _ => {
114                    return Err(MirroredPairedBracketDataTryFromError(i));
115                }
116            }
117        };
118        Ok(MirroredPairedBracketData {
119            mirroring_glyph,
120            mirrored,
121            paired_bracket_type,
122        })
123    }
124}
125
/// A closed Rust enum representing a closed set of the incoming Bidi_Paired_Bracket_Type
/// property values necessary in the internal representation of `MirroredPairedBracketData`
/// to satisfy the ULE invariants on valid values.
///
/// The discriminants (0..=2) are the values packed into the two highest bits of the
/// serialized form of `MirroredPairedBracketData`; changing them changes that
/// serialized representation.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::bidi_data))]
#[repr(u8)]
#[zerovec::make_ule(CheckedBidiPairedBracketTypeULE)]
// This enum is closed in order to help with ULE validation for MirroredPairedBracketData.
#[allow(clippy::exhaustive_enums)]
pub enum CheckedBidiPairedBracketType {
    /// Not a paired bracket.
    None = 0,
    /// Open paired bracket.
    Open = 1,
    /// Close paired bracket.
    Close = 2,
}
145
/// The unaligned (ULE) form of `MirroredPairedBracketData`: a 24-bit value stored as
/// three little-endian bytes.
///
/// Bit layout for the 24 bits (0..=23) of the `[u8; 3]` ULE raw type.
/// LE means first byte is 0..=7, second byte 8..=15, third byte is 16..=23
///  0..=20  Code point return value for Bidi_Mirroring_Glyph value
///    extracted with: mask = 0x1FFFFF <=> [bytes[0], bytes[1], bytes[2] & 0x1F]
///  21..=21 Boolean for Bidi_Mirrored
///    extracted with: bitshift right by 21 followed by mask = 0x1 <=> (bytes[2] >> 5) & 0x1
///  22..=23 Enum discriminant value for Bidi_Paired_Bracket_Type
///    extracted with: bitshift right by 22 followed by mask = 0x3 <=> (bytes[2] >> 6) & 0x3
///                    <=> (bytes[2] >> 6) b/c we left fill with 0s on bitshift right for unsigned
///                         numbers and a byte has 8 bits
#[doc(hidden)] // needed for datagen but not intended for users
#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)]
#[repr(C, packed)]
pub struct MirroredPairedBracketDataULE([u8; 3]);
161
// Safety (based on the safety checklist on the ULE trait):
//  1. MirroredPairedBracketDataULE does not include any uninitialized or padding bytes
//     (achieved by `#[repr(C, packed)]` on a struct whose only field is a `[u8; 3]`)
//  2. MirroredPairedBracketDataULE is aligned to 1 byte.
//     (achieved by `#[repr(C, packed)]` on a struct whose only field is a `[u8; 3]`)
//  3. The impl of validate_byte_slice() returns an error if any byte is not valid.
//  4. The impl of validate_byte_slice() returns an error if there are extra bytes.
//  5. The other ULE methods use the default impl.
//  6. MirroredPairedBracketDataULE byte equality is semantic equality because all 24 bits
//     carry data, so there are no unused bits that would need to be zeroed out.
172unsafe impl ULE for MirroredPairedBracketDataULE {
173    #[inline]
174    fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> {
175        if bytes.len() % 3 != 0 {
176            return Err(ZeroVecError::length::<Self>(bytes.len()));
177        }
178        // Validate the bytes
179        #[allow(clippy::indexing_slicing)] // Won't panic because the chunks are always 3 bytes long
180        for byte_triple in bytes.chunks_exact(3) {
181            // Bidi_Mirroring_Glyph validation
182            #[allow(clippy::unwrap_used)] // chunks_exact returns slices of length 3
183            let [byte0, byte1, byte2] = *<&[u8; 3]>::try_from(byte_triple).unwrap();
184            let mut mirroring_glyph_code_point: u32 = (byte2 & 0x1F) as u32;
185            mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte1 as u32);
186            mirroring_glyph_code_point = (mirroring_glyph_code_point << 8) | (byte0 as u32);
187            let _mirroring_glyph =
188                char::from_u32(mirroring_glyph_code_point).ok_or(ZeroVecError::parse::<Self>())?;
189
190            // skip validating the Bidi_Mirrored boolean since it is always valid
191
192            // assert that Bidi_Paired_Bracket_Type cannot have a 4th value because it only
193            // has 3 values: Open, Close, None
194            if (byte2 & 0xC0) == 0xC0 {
195                return Err(ZeroVecError::parse::<Self>());
196            }
197        }
198
199        Ok(())
200    }
201}
202
203impl AsULE for MirroredPairedBracketData {
204    type ULE = MirroredPairedBracketDataULE;
205
206    #[inline]
207    fn to_unaligned(self) -> Self::ULE {
208        let mut ch = u32::from(self.mirroring_glyph);
209        ch |= u32::from(self.mirrored) << 21;
210        ch |= (self.paired_bracket_type as u32) << 22;
211        let [byte0, byte1, byte2, _] = ch.to_le_bytes();
212        MirroredPairedBracketDataULE([byte0, byte1, byte2])
213    }
214
215    #[inline]
216    fn from_unaligned(unaligned: Self::ULE) -> Self {
217        let [unaligned_byte0, unaligned_byte1, unaligned_byte2] = unaligned.0;
218        let mirroring_glyph_ule_bytes = &[unaligned_byte0, unaligned_byte1, unaligned_byte2 & 0x1F];
219        // Safe because the lower bits 20..0 of MirroredPairedBracketDataULE bytes are the CharULE bytes,
220        // and CharULE::from_unaligned is safe because bytes are defined to represent a valid Unicode code point.
221        let mirroring_glyph_ule =
222            unsafe { CharULE::from_byte_slice_unchecked(mirroring_glyph_ule_bytes) };
223        let mirroring_glyph = mirroring_glyph_ule
224            .first()
225            .map(|ule| char::from_unaligned(*ule))
226            .unwrap_or(char::REPLACEMENT_CHARACTER);
227        let mirrored = ((unaligned.0[2] >> 5) & 0x1) == 1;
228        let paired_bracket_type = {
229            let discriminant = unaligned.0[2] >> 6;
230            debug_assert!(
231                discriminant != 3,
232                "Bidi_Paired_Bracket_Type can only be Open/Close/None in MirroredPairedBracketData"
233            );
234            match discriminant {
235                1 => CheckedBidiPairedBracketType::Open,
236                2 => CheckedBidiPairedBracketType::Close,
237                _ => CheckedBidiPairedBracketType::None,
238            }
239        };
240
241        MirroredPairedBracketData {
242            mirroring_glyph,
243            mirrored,
244            paired_bracket_type,
245        }
246    }
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips the data for U+007B LEFT CURLY BRACKET through the ULE byte form.
    #[test]
    fn test_parse() {
        let original = MirroredPairedBracketData {
            mirroring_glyph: '}',
            mirrored: true,
            paired_bracket_type: CheckedBidiPairedBracketType::Open,
        };
        // 0x00007D is '}'; 0x60 sets the mirrored bit (0x20) and discriminant 1 = Open (0x40).
        let expected_bytes: &[u8] = &[0x7D, 0x0, 0x60];

        // serialize to ULE bytes
        let serialized = [original.to_unaligned()];
        assert_eq!(
            expected_bytes,
            MirroredPairedBracketDataULE::as_byte_slice(&serialized)
        );

        // deserialize from ULE bytes
        let ules = MirroredPairedBracketDataULE::parse_byte_slice(expected_bytes).unwrap();
        let round_tripped = MirroredPairedBracketData::from_unaligned(*ules.first().unwrap());
        assert_eq!(original, round_tripped);
    }

    /// A bracket-type discriminant of 3 must be rejected by ULE validation.
    #[test]
    fn test_parse_error() {
        // Start from the data for U+007B LEFT CURLY BRACKET and force both discriminant
        // bits on. CheckedBidiPairedBracketType only has 3 values (discriminants => 0..=2),
        // so the 4th value expressible in the 2 bits (3) should not parse successfully.
        let invalid_bytes: [u8; 3] = [0x7D, 0x0, 0x60 | 0xC0];

        let parse_result = MirroredPairedBracketDataULE::parse_byte_slice(&invalid_bytes);
        assert!(parse_result.is_err());
    }
}