Skip to main content

tinystr/
int_ops.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::asciibyte::AsciiByte;
6
7/// Internal helper struct that performs operations on aligned integers.
8/// Supports strings up to 4 bytes long.
9#[repr(transparent)]
10pub struct Aligned4(u32);
11
12impl Aligned4 {
13    /// # Panics
14    /// Panics if N is greater than 4
15    #[inline]
16    pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
17        let mut bytes = [0; 4];
18        let mut i = 0;
19        // The function documentation defines when panics may occur
20        #[expect(clippy::indexing_slicing)]
21        while i < N {
22            bytes[i] = src[i] as u8;
23            i += 1;
24        }
25        Self(u32::from_ne_bytes(bytes))
26    }
27
28    pub const fn len(&self) -> usize {
29        let word = self.0;
30        #[cfg(target_endian = "little")]
31        let len = (4 - word.leading_zeros() / 8) as usize;
32        #[cfg(target_endian = "big")]
33        let len = (4 - word.trailing_zeros() / 8) as usize;
34        len
35    }
36
37    pub const fn is_ascii_alphabetic(&self) -> bool {
38        let word = self.0;
39        // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid.
40        // `mask` sets all NUL bytes to 0.
41        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
42        // `lower` converts the string to lowercase. It may also change the value of non-alpha
43        // characters, but this does not matter for the alphabetic test that follows.
44        let lower = word | 0x2020_2020;
45        // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
46        let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
47        // The overall string is valid if every character passes at least one test.
48        // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
49        (alpha & mask) == 0
50    }
51
52    pub const fn is_ascii_alphanumeric(&self) -> bool {
53        let word = self.0;
54        // See explanatory comments in is_ascii_alphabetic
55        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
56        let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
57        let lower = word | 0x2020_2020;
58        let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
59        (alpha & numeric & mask) == 0
60    }
61
62    pub const fn is_ascii_numeric(&self) -> bool {
63        let word = self.0;
64        // See explanatory comments in is_ascii_alphabetic
65        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
66        let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
67        (numeric & mask) == 0
68    }
69
70    pub const fn is_ascii_lowercase(&self) -> bool {
71        let word = self.0;
72        // For efficiency, this function tests for an invalid string rather than a valid string.
73        // A string is ASCII lowercase iff it contains no uppercase ASCII characters.
74        // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
75        let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
76        // The string is valid if it contains no invalid characters (if all high bits are 1).
77        (invalid_case & 0x8080_8080) == 0x8080_8080
78    }
79
80    pub const fn is_ascii_titlecase(&self) -> bool {
81        let word = self.0;
82        // See explanatory comments in is_ascii_lowercase
83        let invalid_case = if truecfg!(target_endian = "little") {
84            !(word + 0x3f3f_3f1f) | (word + 0x2525_2505)
85        } else {
86            !(word + 0x1f3f_3f3f) | (word + 0x0525_2525)
87        };
88        (invalid_case & 0x8080_8080) == 0x8080_8080
89    }
90
91    pub const fn is_ascii_uppercase(&self) -> bool {
92        let word = self.0;
93        // See explanatory comments in is_ascii_lowercase
94        let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
95        (invalid_case & 0x8080_8080) == 0x8080_8080
96    }
97
98    pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
99        let word = self.0;
100        // `mask` sets all NUL bytes to 0.
101        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
102        // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
103        let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
104        // The overall string is valid if every character passes at least one test.
105        // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
106        (lower_alpha & mask) == 0
107    }
108
109    pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
110        let word = self.0;
111        // See explanatory comments in is_ascii_alphabetic_lowercase
112        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
113        let title_case = if truecfg!(target_endian = "little") {
114            !(word + 0x1f1f_1f3f) | (word + 0x0505_0525)
115        } else {
116            !(word + 0x3f1f_1f1f) | (word + 0x2505_0505)
117        };
118        (title_case & mask) == 0
119    }
120
121    pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
122        let word = self.0;
123        // See explanatory comments in is_ascii_alphabetic_lowercase
124        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
125        let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
126        (upper_alpha & mask) == 0
127    }
128
129    pub const fn to_ascii_lowercase(&self) -> [AsciiByte; 4] {
130        let word = self.0;
131        let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2);
132        unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
133    }
134
135    pub const fn to_ascii_titlecase(&self) -> [AsciiByte; 4] {
136        let word = self.0.to_le();
137        let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2;
138        let result = (word | mask) & !(0x20 & mask);
139        unsafe { AsciiByte::to_ascii_byte_array(&u32::from_le(result).to_ne_bytes()) }
140    }
141
142    pub const fn to_ascii_uppercase(&self) -> [AsciiByte; 4] {
143        let word = self.0;
144        let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2);
145        unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
146    }
147}
148
149/// Internal helper struct that performs operations on aligned integers.
150/// Supports strings up to 8 bytes long.
151#[repr(transparent)]
152pub struct Aligned8(u64);
153
154impl Aligned8 {
155    /// # Panics
156    /// Panics if N is greater than 8
157    #[inline]
158    pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
159        let mut bytes = [0; 8];
160        let mut i = 0;
161        // The function documentation defines when panics may occur
162        #[expect(clippy::indexing_slicing)]
163        while i < N {
164            bytes[i] = src[i] as u8;
165            i += 1;
166        }
167        Self(u64::from_ne_bytes(bytes))
168    }
169
170    pub const fn len(&self) -> usize {
171        let word = self.0;
172        #[cfg(target_endian = "little")]
173        let len = (8 - word.leading_zeros() / 8) as usize;
174        #[cfg(target_endian = "big")]
175        let len = (8 - word.trailing_zeros() / 8) as usize;
176        len
177    }
178
179    pub const fn is_ascii_alphabetic(&self) -> bool {
180        let word = self.0;
181        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
182        let lower = word | 0x2020_2020_2020_2020;
183        let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
184        (alpha & mask) == 0
185    }
186
187    pub const fn is_ascii_alphanumeric(&self) -> bool {
188        let word = self.0;
189        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
190        let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
191        let lower = word | 0x2020_2020_2020_2020;
192        let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
193        (alpha & numeric & mask) == 0
194    }
195
196    pub const fn is_ascii_numeric(&self) -> bool {
197        let word = self.0;
198        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
199        let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
200        (numeric & mask) == 0
201    }
202
203    pub const fn is_ascii_lowercase(&self) -> bool {
204        let word = self.0;
205        let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
206        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
207    }
208
209    pub const fn is_ascii_titlecase(&self) -> bool {
210        let word = self.0;
211        let invalid_case = if truecfg!(target_endian = "little") {
212            !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505)
213        } else {
214            !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525)
215        };
216        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
217    }
218
219    pub const fn is_ascii_uppercase(&self) -> bool {
220        let word = self.0;
221        let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
222        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
223    }
224
225    pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
226        let word = self.0;
227        // `mask` sets all NUL bytes to 0.
228        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
229        // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
230        let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
231        // The overall string is valid if every character passes at least one test.
232        // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
233        (lower_alpha & mask) == 0
234    }
235
236    pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
237        let word = self.0;
238        // See explanatory comments in is_ascii_alphabetic_lowercase
239        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
240        let title_case = if truecfg!(target_endian = "little") {
241            !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525)
242        } else {
243            !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505)
244        };
245        (title_case & mask) == 0
246    }
247
248    pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
249        let word = self.0;
250        // See explanatory comments in is_ascii_alphabetic_lowercase
251        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
252        let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
253        (upper_alpha & mask) == 0
254    }
255
256    pub const fn to_ascii_lowercase(&self) -> [AsciiByte; 8] {
257        let word = self.0;
258        let result = word
259            | (((word + 0x3f3f_3f3f_3f3f_3f3f)
260                & !(word + 0x2525_2525_2525_2525)
261                & 0x8080_8080_8080_8080)
262                >> 2);
263        unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
264    }
265
266    pub const fn to_ascii_titlecase(&self) -> [AsciiByte; 8] {
267        let word = self.0.to_le();
268        let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f)
269            & !(word + 0x2525_2525_2525_2505)
270            & 0x8080_8080_8080_8080)
271            >> 2;
272        let result = (word | mask) & !(0x20 & mask);
273        unsafe { AsciiByte::to_ascii_byte_array(&u64::from_le(result).to_ne_bytes()) }
274    }
275
276    pub const fn to_ascii_uppercase(&self) -> [AsciiByte; 8] {
277        let word = self.0;
278        let result = word
279            & !(((word + 0x1f1f_1f1f_1f1f_1f1f)
280                & !(word + 0x0505_0505_0505_0505)
281                & 0x8080_8080_8080_8080)
282                >> 2);
283        unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
284    }
285}