tinystr/int_ops.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::asciibyte::AsciiByte;
6
7/// Internal helper struct that performs operations on aligned integers.
8/// Supports strings up to 4 bytes long.
9#[repr(transparent)]
10pub struct Aligned4(u32);
11
12impl Aligned4 {
13 /// # Panics
14 /// Panics if N is greater than 4
15 #[inline]
16 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
17 let mut bytes = [0; 4];
18 let mut i = 0;
19 // The function documentation defines when panics may occur
20 #[expect(clippy::indexing_slicing)]
21 while i < N {
22 bytes[i] = src[i] as u8;
23 i += 1;
24 }
25 Self(u32::from_ne_bytes(bytes))
26 }
27
28 pub const fn len(&self) -> usize {
29 let word = self.0;
30 #[cfg(target_endian = "little")]
31 let len = (4 - word.leading_zeros() / 8) as usize;
32 #[cfg(target_endian = "big")]
33 let len = (4 - word.trailing_zeros() / 8) as usize;
34 len
35 }
36
37 pub const fn is_ascii_alphabetic(&self) -> bool {
38 let word = self.0;
39 // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid.
40 // `mask` sets all NUL bytes to 0.
41 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
42 // `lower` converts the string to lowercase. It may also change the value of non-alpha
43 // characters, but this does not matter for the alphabetic test that follows.
44 let lower = word | 0x2020_2020;
45 // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
46 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
47 // The overall string is valid if every character passes at least one test.
48 // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
49 (alpha & mask) == 0
50 }
51
52 pub const fn is_ascii_alphanumeric(&self) -> bool {
53 let word = self.0;
54 // See explanatory comments in is_ascii_alphabetic
55 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
56 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
57 let lower = word | 0x2020_2020;
58 let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
59 (alpha & numeric & mask) == 0
60 }
61
62 pub const fn is_ascii_numeric(&self) -> bool {
63 let word = self.0;
64 // See explanatory comments in is_ascii_alphabetic
65 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
66 let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
67 (numeric & mask) == 0
68 }
69
70 pub const fn is_ascii_lowercase(&self) -> bool {
71 let word = self.0;
72 // For efficiency, this function tests for an invalid string rather than a valid string.
73 // A string is ASCII lowercase iff it contains no uppercase ASCII characters.
74 // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
75 let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
76 // The string is valid if it contains no invalid characters (if all high bits are 1).
77 (invalid_case & 0x8080_8080) == 0x8080_8080
78 }
79
80 pub const fn is_ascii_titlecase(&self) -> bool {
81 let word = self.0;
82 // See explanatory comments in is_ascii_lowercase
83 let invalid_case = if cfg!(target_endian = "little") {
84 !(word + 0x3f3f_3f1f) | (word + 0x2525_2505)
85 } else {
86 !(word + 0x1f3f_3f3f) | (word + 0x0525_2525)
87 };
88 (invalid_case & 0x8080_8080) == 0x8080_8080
89 }
90
91 pub const fn is_ascii_uppercase(&self) -> bool {
92 let word = self.0;
93 // See explanatory comments in is_ascii_lowercase
94 let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
95 (invalid_case & 0x8080_8080) == 0x8080_8080
96 }
97
98 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
99 let word = self.0;
100 // `mask` sets all NUL bytes to 0.
101 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
102 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
103 let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
104 // The overall string is valid if every character passes at least one test.
105 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
106 (lower_alpha & mask) == 0
107 }
108
109 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
110 let word = self.0;
111 // See explanatory comments in is_ascii_alphabetic_lowercase
112 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
113 let title_case = if cfg!(target_endian = "little") {
114 !(word + 0x1f1f_1f3f) | (word + 0x0505_0525)
115 } else {
116 !(word + 0x3f1f_1f1f) | (word + 0x2505_0505)
117 };
118 (title_case & mask) == 0
119 }
120
121 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
122 let word = self.0;
123 // See explanatory comments in is_ascii_alphabetic_lowercase
124 let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
125 let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
126 (upper_alpha & mask) == 0
127 }
128
129 pub const fn to_ascii_lowercase(&self) -> [AsciiByte; 4] {
130 let word = self.0;
131 let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2);
132 unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
133 }
134
135 pub const fn to_ascii_titlecase(&self) -> [AsciiByte; 4] {
136 let word = self.0.to_le();
137 let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2;
138 let result = (word | mask) & !(0x20 & mask);
139 unsafe { AsciiByte::to_ascii_byte_array(&u32::from_le(result).to_ne_bytes()) }
140 }
141
142 pub const fn to_ascii_uppercase(&self) -> [AsciiByte; 4] {
143 let word = self.0;
144 let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2);
145 unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
146 }
147}
148
149/// Internal helper struct that performs operations on aligned integers.
150/// Supports strings up to 8 bytes long.
151#[repr(transparent)]
152pub struct Aligned8(u64);
153
154impl Aligned8 {
155 /// # Panics
156 /// Panics if N is greater than 8
157 #[inline]
158 pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
159 let mut bytes = [0; 8];
160 let mut i = 0;
161 // The function documentation defines when panics may occur
162 #[expect(clippy::indexing_slicing)]
163 while i < N {
164 bytes[i] = src[i] as u8;
165 i += 1;
166 }
167 Self(u64::from_ne_bytes(bytes))
168 }
169
170 pub const fn len(&self) -> usize {
171 let word = self.0;
172 #[cfg(target_endian = "little")]
173 let len = (8 - word.leading_zeros() / 8) as usize;
174 #[cfg(target_endian = "big")]
175 let len = (8 - word.trailing_zeros() / 8) as usize;
176 len
177 }
178
179 pub const fn is_ascii_alphabetic(&self) -> bool {
180 let word = self.0;
181 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
182 let lower = word | 0x2020_2020_2020_2020;
183 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
184 (alpha & mask) == 0
185 }
186
187 pub const fn is_ascii_alphanumeric(&self) -> bool {
188 let word = self.0;
189 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
190 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
191 let lower = word | 0x2020_2020_2020_2020;
192 let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
193 (alpha & numeric & mask) == 0
194 }
195
196 pub const fn is_ascii_numeric(&self) -> bool {
197 let word = self.0;
198 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
199 let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
200 (numeric & mask) == 0
201 }
202
203 pub const fn is_ascii_lowercase(&self) -> bool {
204 let word = self.0;
205 let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
206 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
207 }
208
209 pub const fn is_ascii_titlecase(&self) -> bool {
210 let word = self.0;
211 let invalid_case = if cfg!(target_endian = "little") {
212 !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505)
213 } else {
214 !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525)
215 };
216 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
217 }
218
219 pub const fn is_ascii_uppercase(&self) -> bool {
220 let word = self.0;
221 let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
222 (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
223 }
224
225 pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
226 let word = self.0;
227 // `mask` sets all NUL bytes to 0.
228 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
229 // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
230 let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
231 // The overall string is valid if every character passes at least one test.
232 // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
233 (lower_alpha & mask) == 0
234 }
235
236 pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
237 let word = self.0;
238 // See explanatory comments in is_ascii_alphabetic_lowercase
239 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
240 let title_case = if cfg!(target_endian = "little") {
241 !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525)
242 } else {
243 !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505)
244 };
245 (title_case & mask) == 0
246 }
247
248 pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
249 let word = self.0;
250 // See explanatory comments in is_ascii_alphabetic_lowercase
251 let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
252 let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
253 (upper_alpha & mask) == 0
254 }
255
256 pub const fn to_ascii_lowercase(&self) -> [AsciiByte; 8] {
257 let word = self.0;
258 let result = word
259 | (((word + 0x3f3f_3f3f_3f3f_3f3f)
260 & !(word + 0x2525_2525_2525_2525)
261 & 0x8080_8080_8080_8080)
262 >> 2);
263 unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
264 }
265
266 pub const fn to_ascii_titlecase(&self) -> [AsciiByte; 8] {
267 let word = self.0.to_le();
268 let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f)
269 & !(word + 0x2525_2525_2525_2505)
270 & 0x8080_8080_8080_8080)
271 >> 2;
272 let result = (word | mask) & !(0x20 & mask);
273 unsafe { AsciiByte::to_ascii_byte_array(&u64::from_le(result).to_ne_bytes()) }
274 }
275
276 pub const fn to_ascii_uppercase(&self) -> [AsciiByte; 8] {
277 let word = self.0;
278 let result = word
279 & !(((word + 0x1f1f_1f1f_1f1f_1f1f)
280 & !(word + 0x0505_0505_0505_0505)
281 & 0x8080_8080_8080_8080)
282 >> 2);
283 unsafe { AsciiByte::to_ascii_byte_array(&result.to_ne_bytes()) }
284 }
285}