zmij/
lib.rs

1//! [![github]](https://github.com/dtolnay/zmij) [![crates-io]](https://crates.io/crates/zmij) [![docs-rs]](https://docs.rs/zmij)
2//!
3//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
6//!
7//! <br>
8//!
9//! A double-to-string conversion algorithm based on [Schubfach] and [yy].
10//!
11//! This Rust implementation is a line-by-line port of Victor Zverovich's
12//! implementation in C++, <https://github.com/vitaut/zmij>.
13//!
14//! [Schubfach]: https://fmt.dev/papers/Schubfach4.pdf
15//! [yy]: https://github.com/ibireme/c_numconv_benchmark/blob/master/vendor/yy_double/yy_double.c
16//!
17//! <br>
18//!
19//! # Example
20//!
21//! ```
22//! fn main() {
23//!     let mut buffer = zmij::Buffer::new();
24//!     let printed = buffer.format(1.234);
25//!     assert_eq!(printed, "1.234");
26//! }
27//! ```
28//!
29//! <br>
30//!
31//! ## Performance
32//!
33//! The [dtoa-benchmark] compares this library and other Rust floating point
34//! formatting implementations across a range of precisions. The vertical axis
35//! in this chart shows nanoseconds taken by a single execution of
36//! `zmij::Buffer::new().format_finite(value)` so a lower result indicates a
37//! faster library.
38//!
39//! [dtoa-benchmark]: https://github.com/dtolnay/dtoa-benchmark
40//!
41//! ![performance](https://raw.githubusercontent.com/dtolnay/zmij/master/dtoa-benchmark.png)
42
43#![no_std]
44#![doc(html_root_url = "https://docs.rs/zmij/1.0.12")]
45#![deny(unsafe_op_in_unsafe_fn)]
46#![allow(non_camel_case_types)]
47#![allow(
48    clippy::blocks_in_conditions,
49    clippy::cast_possible_truncation,
50    clippy::cast_possible_wrap,
51    clippy::cast_ptr_alignment,
52    clippy::cast_sign_loss,
53    clippy::doc_markdown,
54    clippy::incompatible_msrv,
55    clippy::items_after_statements,
56    clippy::must_use_candidate,
57    clippy::needless_doctest_main,
58    clippy::never_loop,
59    clippy::redundant_else,
60    clippy::similar_names,
61    clippy::too_many_lines,
62    clippy::unreadable_literal,
63    clippy::while_immutable_condition,
64    clippy::wildcard_imports
65)]
66
67#[cfg(test)]
68mod tests;
69mod traits;
70
71#[cfg(all(any(target_arch = "aarch64", target_arch = "x86_64"), not(miri)))]
72use core::arch::asm;
73#[cfg(not(zmij_no_select_unpredictable))]
74use core::hint;
75use core::mem::{self, MaybeUninit};
76use core::ptr;
77use core::slice;
78use core::str;
79#[cfg(feature = "no-panic")]
80use no_panic::no_panic;
81
82const BUFFER_SIZE: usize = 24;
83const NAN: &str = "NaN";
84const INFINITY: &str = "inf";
85const NEG_INFINITY: &str = "-inf";
86
87// A decimal floating-point number sig * pow(10, exp).
88// If exp is non_finite_exp then the number is a NaN or an infinity.
89struct dec_fp {
90    sig: i64, // significand
91    exp: i32, // exponent
92}
93
94#[cfg_attr(test, derive(Debug, PartialEq))]
95struct uint128 {
96    hi: u64,
97    lo: u64,
98}
99
100// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
101const fn umul128(x: u64, y: u64) -> u128 {
102    x as u128 * y as u128
103}
104
105const fn umul128_upper64(x: u64, y: u64) -> u64 {
106    (umul128(x, y) >> 64) as u64
107}
108
109#[cfg_attr(feature = "no-panic", no_panic)]
110fn umul192_upper128(x_hi: u64, x_lo: u64, y: u64) -> uint128 {
111    let p = umul128(x_hi, y);
112    let lo = (p as u64).wrapping_add((umul128(x_lo, y) >> 64) as u64);
113    uint128 {
114        hi: (p >> 64) as u64 + u64::from(lo < p as u64),
115        lo,
116    }
117}
118
119// Computes upper 64 bits of multiplication of x and y, discards the least
120// significant bit and rounds to odd, where x = uint128_t(x_hi << 64) | x_lo.
121#[cfg_attr(feature = "no-panic", no_panic)]
122fn umul_upper_inexact_to_odd<UInt>(x_hi: u64, x_lo: u64, y: UInt) -> UInt
123where
124    UInt: traits::UInt,
125{
126    let num_bits = mem::size_of::<UInt>() * 8;
127    if num_bits == 64 {
128        let p = umul192_upper128(x_hi, x_lo, y.into());
129        UInt::truncate(p.hi | u64::from((p.lo >> 1) != 0))
130    } else {
131        let p = (umul128(x_hi, y.into()) >> 32) as u64;
132        UInt::enlarge((p >> 32) as u32 | u32::from((p as u32 >> 1) != 0))
133    }
134}
135
136trait FloatTraits: traits::Float {
137    const NUM_BITS: i32;
138    const NUM_SIG_BITS: i32 = Self::MANTISSA_DIGITS as i32 - 1;
139    const NUM_EXP_BITS: i32 = Self::NUM_BITS - Self::NUM_SIG_BITS - 1;
140    const EXP_MASK: i32 = (1 << Self::NUM_EXP_BITS) - 1;
141    const EXP_BIAS: i32 = (1 << (Self::NUM_EXP_BITS - 1)) - 1;
142
143    type SigType: traits::UInt;
144    const IMPLICIT_BIT: Self::SigType;
145
146    fn to_bits(self) -> Self::SigType;
147
148    fn is_negative(bits: Self::SigType) -> bool {
149        (bits >> (Self::NUM_BITS - 1)) != Self::SigType::from(0)
150    }
151
152    fn get_sig(bits: Self::SigType) -> Self::SigType {
153        bits & (Self::IMPLICIT_BIT - Self::SigType::from(1))
154    }
155
156    fn get_exp(bits: Self::SigType) -> i32 {
157        (bits >> Self::NUM_SIG_BITS).into() as i32 & Self::EXP_MASK
158    }
159}
160
161impl FloatTraits for f32 {
162    const NUM_BITS: i32 = 32;
163    const IMPLICIT_BIT: u32 = 1 << Self::NUM_SIG_BITS;
164
165    type SigType = u32;
166
167    fn to_bits(self) -> Self::SigType {
168        self.to_bits()
169    }
170}
171
172impl FloatTraits for f64 {
173    const NUM_BITS: i32 = 64;
174    const IMPLICIT_BIT: u64 = 1 << Self::NUM_SIG_BITS;
175
176    type SigType = u64;
177
178    fn to_bits(self) -> Self::SigType {
179        self.to_bits()
180    }
181}
182
183struct Pow10SignificandsTable {
184    data: [u64; Self::NUM_POW10 * 2],
185}
186
187impl Pow10SignificandsTable {
188    const NUM_POW10: usize = 617;
189    const SPLIT_TABLES: bool = true;
190
191    unsafe fn get_unchecked(&self, dec_exp: i32) -> uint128 {
192        const DEC_EXP_MIN: i32 = -292;
193        if !Self::SPLIT_TABLES {
194            let index = ((dec_exp - DEC_EXP_MIN) * 2) as usize;
195            return uint128 {
196                hi: unsafe { *self.data.get_unchecked(index) },
197                lo: unsafe { *self.data.get_unchecked(index + 1) },
198            };
199        }
200
201        unsafe {
202            #[cfg_attr(
203                not(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri))),
204                allow(unused_mut)
205            )]
206            let mut hi = self
207                .data
208                .as_ptr()
209                .offset(Self::NUM_POW10 as isize + DEC_EXP_MIN as isize - 1);
210            #[cfg_attr(
211                not(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri))),
212                allow(unused_mut)
213            )]
214            let mut lo = hi.add(Self::NUM_POW10);
215
216            // Force indexed loads.
217            #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri)))]
218            asm!("/*{0}{1}*/", inout(reg) hi, inout(reg) lo);
219            uint128 {
220                hi: *hi.offset(-dec_exp as isize),
221                lo: *lo.offset(-dec_exp as isize),
222            }
223        }
224    }
225
226    #[cfg(test)]
227    fn get(&self, dec_exp: i32) -> uint128 {
228        const DEC_EXP_MIN: i32 = -292;
229        assert!((DEC_EXP_MIN..DEC_EXP_MIN + Self::NUM_POW10 as i32).contains(&dec_exp));
230        unsafe { self.get_unchecked(dec_exp) }
231    }
232}
233
234// 128-bit significands of powers of 10 rounded down.
235// Generated using 192-bit arithmetic method by Dougall Johnson.
236static POW10_SIGNIFICANDS: Pow10SignificandsTable = {
237    let mut data = [0; Pow10SignificandsTable::NUM_POW10 * 2];
238
239    struct uint192 {
240        w0: u64, // least significant
241        w1: u64,
242        w2: u64, // most significant
243    }
244
245    // first element, rounded up to cancel out rounding down in the
246    // multiplication, and minimise significant bits
247    let mut current = uint192 {
248        w0: 0xe000000000000000,
249        w1: 0x25e8e89c13bb0f7a,
250        w2: 0xff77b1fcbebcdc4f,
251    };
252    let ten = 0xa000000000000000;
253    let mut i = 0;
254    while i < Pow10SignificandsTable::NUM_POW10 {
255        if Pow10SignificandsTable::SPLIT_TABLES {
256            data[Pow10SignificandsTable::NUM_POW10 - i - 1] = current.w2;
257            data[Pow10SignificandsTable::NUM_POW10 * 2 - i - 1] = current.w1;
258        } else {
259            data[i * 2] = current.w2;
260            data[i * 2 + 1] = current.w1;
261        }
262
263        let h0: u64 = umul128_upper64(current.w0, ten);
264        let h1: u64 = umul128_upper64(current.w1, ten);
265
266        let c0: u64 = h0.wrapping_add(current.w1.wrapping_mul(ten));
267        let c1: u64 = ((c0 < h0) as u64 + h1).wrapping_add(current.w2.wrapping_mul(ten));
268        let c2: u64 = (c1 < h1) as u64 + umul128_upper64(current.w2, ten); // dodgy carry
269
270        // normalise
271        if (c2 >> 63) != 0 {
272            current = uint192 {
273                w0: c0,
274                w1: c1,
275                w2: c2,
276            };
277        } else {
278            current = uint192 {
279                w0: c0 << 1,
280                w1: c1 << 1 | c0 >> 63,
281                w2: c2 << 1 | c1 >> 63,
282            };
283        }
284
285        i += 1;
286    }
287
288    Pow10SignificandsTable { data }
289};
290
291#[cfg_attr(feature = "no-panic", no_panic)]
292fn count_trailing_nonzeros(x: u64) -> usize {
293    // We count the number of bytes until there are only zeros left.
294    // The code is equivalent to
295    //    8 - x.leading_zeros() / 8
296    // but if the BSR instruction is emitted (as gcc on x64 does with default
297    // settings), subtracting the constant before dividing allows the compiler
298    // to combine it with the subtraction which it inserts due to BSR counting
299    // in the opposite direction.
300    //
301    // Additionally, the BSR instruction requires a zero check. Since the high
302    // bit is unused we can avoid the zero check by shifting the datum left by
303    // one and inserting a sentinel bit at the end. This can be faster than the
304    // automatically inserted range check.
305    (70 - ((x.to_le() << 1) | 1).leading_zeros()) as usize / 8
306}
307
308// Align data since unaligned access may be slower when crossing a
309// hardware-specific boundary.
310#[repr(C, align(2))]
311struct Digits2([u8; 200]);
312
313static DIGITS2: Digits2 = Digits2(
314    *b"0001020304050607080910111213141516171819\
315       2021222324252627282930313233343536373839\
316       4041424344454647484950515253545556575859\
317       6061626364656667686970717273747576777879\
318       8081828384858687888990919293949596979899",
319);
320
321// Converts value in the range [0, 100) to a string. GCC generates a bit better
322// code when value is pointer-size (https://www.godbolt.org/z/5fEPMT1cc).
323#[cfg_attr(feature = "no-panic", no_panic)]
324unsafe fn digits2(value: usize) -> &'static u16 {
325    if true {
    if !(value < 100) {
        ::core::panicking::panic("assertion failed: value < 100")
    };
};debug_assert!(value < 100);
326
327    #[allow(clippy::cast_ptr_alignment)]
328    unsafe {
329        &*DIGITS2.0.as_ptr().cast::<u16>().add(value)
330    }
331}
332
333#[cfg_attr(feature = "no-panic", no_panic)]
334fn to_bcd8(abcdefgh: u64) -> u64 {
335    // An optimization from Xiang JunBo.
336    // Three steps BCD. Base 10000 -> base 100 -> base 10.
337    // div and mod are evaluated simultaneously as, e.g.
338    //   (abcdefgh / 10000) << 32 + (abcdefgh % 10000)
339    //      == abcdefgh + (2^32 - 10000) * (abcdefgh / 10000)))
340    // where the division on the RHS is implemented by the usual multiply + shift
341    // trick and the fractional bits are masked away.
342    let abcd_efgh = abcdefgh + (0x100000000 - 10000) * ((abcdefgh * 0x68db8bb) >> 40);
343    let ab_cd_ef_gh = abcd_efgh + (0x10000 - 100) * (((abcd_efgh * 0x147b) >> 19) & 0x7f0000007f);
344    let a_b_c_d_e_f_g_h =
345        ab_cd_ef_gh + (0x100 - 10) * (((ab_cd_ef_gh * 0x67) >> 10) & 0xf000f000f000f);
346    a_b_c_d_e_f_g_h.to_be()
347}
348
349unsafe fn write_if_nonzero(buffer: *mut u8, digit: u32) -> *mut u8 {
350    unsafe {
351        *buffer = b'0' + digit as u8;
352        buffer.add(usize::from(digit != 0))
353    }
354}
355
356unsafe fn write8(buffer: *mut u8, value: u64) {
357    unsafe {
358        buffer.cast::<u64>().write_unaligned(value);
359    }
360}
361
362const ZEROS: u64 = 0x0101010101010101 * b'0' as u64;
363
364// Writes a significand consisting of up to 17 decimal digits (16-17 for
365// normals) and removes trailing zeros.
366#[cfg_attr(feature = "no-panic", no_panic)]
367unsafe fn write_significand17(mut buffer: *mut u8, value: u64) -> *mut u8 {
368    #[cfg(all(target_arch = "aarch64", target_feature = "neon", not(miri)))]
369    {
370        use core::arch::aarch64::*;
371
372        // An optimized version for NEON by Dougall Johnson.
373        struct ToStringConstants {
374            mul_const: u64,
375            hundred_million: u64,
376            multipliers32: [i32; 4],
377            multipliers16: [i16; 8],
378        }
379
380        static CONSTANTS: ToStringConstants = ToStringConstants {
381            mul_const: 0xabcc77118461cefd,
382            hundred_million: 100000000,
383            multipliers32: [0x68db8bb, -10000 + 0x10000, 0x147b000, -100 + 0x10000],
384            multipliers16: [0xce0, -10 + 0x100, 0, 0, 0, 0, 0, 0],
385        };
386
387        let mut c = ptr::addr_of!(CONSTANTS);
388
389        // Compiler barrier, or clang doesn't load from memory and generates 15
390        // more instructions
391        let c = unsafe {
392            asm!("/*{0}*/", inout(reg) c);
393            &*c
394        };
395
396        let mut hundred_million = c.hundred_million;
397
398        // Compiler barrier, or clang narrows the load to 32-bit and unpairs it.
399        unsafe {
400            asm!("/*{0}*/", inout(reg) hundred_million);
401        }
402
403        // Equivalent to abbccddee = value / 100000000, ffgghhii = value % 100000000.
404        let mut abbccddee = (umul128(value, c.mul_const) >> 90) as u64;
405        let ffgghhii = value - abbccddee * hundred_million;
406
407        // We could probably make this bit faster, but we're preferring to
408        // reuse the constants for now.
409        let a = (umul128(abbccddee, c.mul_const) >> 90) as u64;
410        abbccddee -= a * hundred_million;
411
412        buffer = unsafe { write_if_nonzero(buffer, a as u32) };
413
414        unsafe {
415            let hundredmillions64: uint64x1_t =
416                mem::transmute::<u64, uint64x1_t>(abbccddee | (ffgghhii << 32));
417            let hundredmillions32: int32x2_t = vreinterpret_s32_u64(hundredmillions64);
418
419            let high_10000: int32x2_t = vreinterpret_s32_u32(vshr_n_u32(
420                vreinterpret_u32_s32(vqdmulh_n_s32(hundredmillions32, c.multipliers32[0])),
421                9,
422            ));
423            let tenthousands: int32x2_t =
424                vmla_n_s32(hundredmillions32, high_10000, c.multipliers32[1]);
425
426            let mut extended: int32x4_t =
427                vreinterpretq_s32_u32(vshll_n_u16(vreinterpret_u16_s32(tenthousands), 0));
428
429            // Compiler barrier, or clang breaks the subsequent MLA into UADDW +
430            // MUL.
431            asm!("/*{:v}*/", inout(vreg) extended);
432
433            let high_100: int32x4_t = vqdmulhq_n_s32(extended, c.multipliers32[2]);
434            let hundreds: int16x8_t =
435                vreinterpretq_s16_s32(vmlaq_n_s32(extended, high_100, c.multipliers32[3]));
436            let high_10: int16x8_t = vqdmulhq_n_s16(hundreds, c.multipliers16[0]);
437            let digits: uint8x16_t = vrev64q_u8(vreinterpretq_u8_s16(vmlaq_n_s16(
438                hundreds,
439                high_10,
440                c.multipliers16[1],
441            )));
442            let ascii: uint16x8_t = vaddq_u16(
443                vreinterpretq_u16_u8(digits),
444                vreinterpretq_u16_s8(vdupq_n_s8(b'0' as i8)),
445            );
446
447            buffer.cast::<uint16x8_t>().write_unaligned(ascii);
448
449            let is_zero: uint16x8_t = vreinterpretq_u16_u8(vceqq_u8(digits, vdupq_n_u8(0)));
450            let zeros: u64 = !vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(is_zero, 4)), 0);
451
452            buffer.add(16 - (zeros.leading_zeros() as usize >> 2))
453        }
454    }
455
456    #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1", not(miri)))]
457    {
458        use core::arch::x86_64::*;
459
460        let abbccddee = (value / 100_000_000) as u32;
461        let ffgghhii = (value % 100_000_000) as u32;
462        let a = abbccddee / 100_000_000;
463        let bbccddee = abbccddee % 100_000_000;
464
465        buffer = unsafe { write_if_nonzero(buffer, a) };
466        unsafe {
467            // This BCD sequence is by Xiang JunBo. It works the same as the one
468            // in to_bc8 but the masking can be avoided by using vector entries
469            // of the right size, and in the last step a shift operation is
470            // avoided by increasing the shift to 32 bits and then using
471            // ...mulhi... to avoid the shift.
472            let x: __m128i = _mm_set_epi64x(i64::from(ffgghhii), i64::from(bbccddee));
473            let y: __m128i = _mm_add_epi64(
474                x,
475                _mm_mul_epu32(
476                    _mm_set1_epi64x((1 << 32) - 10000),
477                    _mm_srli_epi64(_mm_mul_epu32(x, _mm_set1_epi64x(109951163)), 40),
478                ),
479            );
480            let z: __m128i = _mm_add_epi64(
481                y,
482                _mm_mullo_epi32(
483                    _mm_set1_epi32((1 << 16) - 100),
484                    _mm_srli_epi32(_mm_mulhi_epu16(y, _mm_set1_epi16(5243)), 3),
485                ),
486            );
487            let big_endian_bcd: __m128i = _mm_add_epi64(
488                z,
489                _mm_mullo_epi16(
490                    _mm_set1_epi16((1 << 8) - 10),
491                    _mm_mulhi_epu16(z, _mm_set1_epi16(6554)),
492                ),
493            );
494            let bcd: __m128i = _mm_shuffle_epi8(
495                big_endian_bcd,
496                _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7),
497            );
498
499            // convert to ascii
500            let ascii0: __m128i = _mm_set1_epi8(b'0' as i8);
501            let digits = _mm_add_epi8(bcd, ascii0);
502
503            // determine number of leading zeros
504            let mask: u16 = !_mm_movemask_epi8(_mm_cmpeq_epi8(bcd, _mm_setzero_si128())) as u16;
505            let len = 64 - u64::from(mask).leading_zeros();
506
507            // and save result
508            _mm_storeu_si128(buffer.cast::<__m128i>(), digits);
509
510            buffer.add(len as usize)
511        }
512    }
513
514    #[cfg(not(any(
515        all(target_arch = "aarch64", target_feature = "neon", not(miri)),
516        all(target_arch = "x86_64", target_feature = "sse4.1", not(miri)),
517    )))]
518    {
519        // Each digits is denoted by a letter so value is abbccddeeffgghhii where
520        // digit a can be zero.
521        let abbccddee = (value / 100_000_000) as u32;
522        let ffgghhii = (value % 100_000_000) as u32;
523        buffer = unsafe { write_if_nonzero(buffer, abbccddee / 100_000_000) };
524        let bcd = to_bcd8(u64::from(abbccddee % 100_000_000));
525        unsafe {
526            write8(buffer, bcd | ZEROS);
527        }
528        if ffgghhii == 0 {
529            return unsafe { buffer.add(count_trailing_nonzeros(bcd)) };
530        }
531        let bcd = to_bcd8(u64::from(ffgghhii));
532        unsafe {
533            write8(buffer.add(8), bcd | ZEROS);
534            buffer.add(8).add(count_trailing_nonzeros(bcd))
535        }
536    }
537}
538
539// Writes a significand consisting of up to 9 decimal digits (8-9 for normals)
540// and removes trailing zeros.
541#[cfg_attr(feature = "no-panic", no_panic)]
542unsafe fn write_significand9(mut buffer: *mut u8, value: u32) -> *mut u8 {
543    buffer = unsafe { write_if_nonzero(buffer, value / 100_000_000) };
544    let bcd = to_bcd8(u64::from(value % 100_000_000));
545    unsafe {
546        write8(buffer, bcd | ZEROS);
547        buffer.add(count_trailing_nonzeros(bcd))
548    }
549}
550
551// Computes the decimal exponent as floor(log10(2**bin_exp)) if regular or
552// floor(log10(3/4 * 2**bin_exp)) otherwise, without branching.
553const fn compute_dec_exp(bin_exp: i32, regular: bool) -> i32 {
554    if true {
    if !(bin_exp >= -1334 && bin_exp <= 2620) {
        ::core::panicking::panic("assertion failed: bin_exp >= -1334 && bin_exp <= 2620")
    };
};debug_assert!(bin_exp >= -1334 && bin_exp <= 2620);
555    // log10_3_over_4_sig = -log10(3/4) * 2**log10_2_exp rounded to a power of 2
556    const LOG10_3_OVER_4_SIG: i32 = 131_072;
557    // log10_2_sig = round(log10(2) * 2**log10_2_exp)
558    const LOG10_2_SIG: i32 = 315_653;
559    const LOG10_2_EXP: i32 = 20;
560    (bin_exp * LOG10_2_SIG - !regular as i32 * LOG10_3_OVER_4_SIG) >> LOG10_2_EXP
561}
562
563// Computes a shift so that, after scaling by a power of 10, the intermediate
564// result always has a fixed 128-bit fractional part (for double).
565//
566// Different binary exponents can map to the same decimal exponent, but place
567// the decimal point at different bit positions. The shift compensates for this.
568//
569// For example, both 3 * 2**59 and 3 * 2**60 have dec_exp = 2, but dividing by
570// 10^dec_exp puts the decimal point in different bit positions:
571//   3 * 2**59 / 100 = 1.72...e+16  (needs shift = 1 + 1)
572//   3 * 2**60 / 100 = 3.45...e+16  (needs shift = 2 + 1)
573const fn compute_exp_shift(bin_exp: i32, dec_exp: i32) -> i32 {
574    if true {
    if !(dec_exp >= -350 && dec_exp <= 350) {
        ::core::panicking::panic("assertion failed: dec_exp >= -350 && dec_exp <= 350")
    };
};debug_assert!(dec_exp >= -350 && dec_exp <= 350);
575    // log2_pow10_sig = round(log2(10) * 2**log2_pow10_exp) + 1
576    const LOG2_POW10_SIG: i32 = 217_707;
577    const LOG2_POW10_EXP: i32 = 16;
578    // pow10_bin_exp = floor(log2(10**-dec_exp))
579    let pow10_bin_exp = (-dec_exp * LOG2_POW10_SIG) >> LOG2_POW10_EXP;
580    // pow10 = ((pow10_hi << 64) | pow10_lo) * 2**(pow10_bin_exp - 127)
581
582    // Shift to ensure the intermediate result of multiplying by a power of 10
583    // has a fixed 128-bit fractional part. For example, 3 * 2**59 and 3 * 2**60
584    // both have dec_exp = 2 and dividing them by 10**dec_exp would have the
585    // decimal point in different (bit) positions without the shift:
586    //   3 * 2**59 / 100 = 1.72...e+16 (exp_shift = 1 + 1)
587    //   3 * 2**60 / 100 = 3.45...e+16 (exp_shift = 2 + 1)
588    bin_exp + pow10_bin_exp + 1
589}
590
591fn normalize<UInt>(mut dec: dec_fp, subnormal: bool) -> dec_fp
592where
593    UInt: traits::UInt,
594{
595    if !subnormal {
596        return dec;
597    }
598    let num_bits = mem::size_of::<UInt>() * 8;
599    while dec.sig
600        < if num_bits == 64 {
601            10_000_000_000_000_000
602        } else {
603            100_000_000
604        }
605    {
606        dec.sig *= 10;
607        dec.exp -= 1;
608    }
609    dec
610}
611
612// Converts a binary FP number bin_sig * 2**bin_exp to the shortest decimal
613// representation.
614#[cfg_attr(feature = "no-panic", no_panic)]
615fn to_decimal<UInt>(
616    bin_sig: UInt,
617    bin_exp: i32,
618    mut dec_exp: i32,
619    regular: bool,
620    subnormal: bool,
621) -> dec_fp
622where
623    UInt: traits::UInt,
624{
625    let num_bits = mem::size_of::<UInt>() as i32 * 8;
626    // An optimization from yy by Yaoyuan Guo:
627    while regular && !subnormal {
628        let exp_shift = compute_exp_shift(bin_exp, dec_exp);
629        let pow10 = unsafe { POW10_SIGNIFICANDS.get_unchecked(-dec_exp) };
630
631        let integral; // integral part of bin_sig * pow10
632        let fractional; // fractional part of bin_sig * pow10
633        if num_bits == 64 {
634            let p = umul192_upper128(pow10.hi, pow10.lo, (bin_sig << exp_shift).into());
635            integral = UInt::truncate(p.hi);
636            fractional = p.lo;
637        } else {
638            let p = umul128(pow10.hi, (bin_sig << exp_shift).into());
639            integral = UInt::truncate((p >> 64) as u64);
640            fractional = p as u64;
641        }
642        const HALF_ULP: u64 = 1 << 63;
643
644        // Exact half-ulp tie when rounding to nearest integer.
645        if fractional == HALF_ULP {
646            break;
647        }
648
649        #[cfg(all(any(target_arch = "aarch64", target_arch = "x86_64"), not(miri)))]
650        let digit = {
651            // An optimization of integral % 10 by Dougall Johnson. Relies on
652            // range calculation: (max_bin_sig << max_exp_shift) * max_u128.
653            let quo10 = ((u128::from(integral.into()) * ((1 << 64) / 10 + 1)) >> 64) as u64;
654            let mut digit = integral.into() - quo10 * 10;
655            unsafe {
656                asm!("/*{0}*/", inout(reg) digit); // or it narrows to 32-bit and doesn't use madd/msub
657            }
658            digit
659        };
660        #[cfg(not(all(any(target_arch = "aarch64", target_arch = "x86_64"), not(miri))))]
661        let digit = integral.into() % 10;
662
663        // Switch to a fixed-point representation with the least significant
664        // integral digit in the upper bits and fractional digits in the lower
665        // bits.
666        let num_integral_bits = if num_bits == 64 { 4 } else { 32 };
667        let num_fractional_bits = 64 - num_integral_bits;
668        let ten = 10u64 << num_fractional_bits;
669        // Fixed-point remainder of the scaled significand modulo 10.
670        let scaled_sig_mod10 = (digit << num_fractional_bits) | (fractional >> num_integral_bits);
671
672        // scaled_half_ulp = 0.5 * pow10 in the fixed-point format.
673        // dec_exp is chosen so that 10**dec_exp <= 2**bin_exp < 10**(dec_exp + 1).
674        // Since 1ulp == 2**bin_exp it will be in the range [1, 10) after scaling
675        // by 10**dec_exp. Add 1 to combine the shift with division by two.
676        let scaled_half_ulp = pow10.hi >> (num_integral_bits - exp_shift + 1);
677        let upper = scaled_sig_mod10 + scaled_half_ulp;
678
679        // value = 5.0507837461e-27
680        // next  = 5.0507837461000010e-27
681        //
682        // c = integral.fractional' = 50507837461000003.153987... (value)
683        //                            50507837461000010.328635... (next)
684        //          scaled_half_ulp =                 3.587324...
685        //
686        // fractional' = fractional / 2**64, fractional = 2840565642863009226
687        //
688        //      50507837461000000       c               upper     50507837461000010
689        //              s              l|   L             |               S
690        // ───┬────┬────┼────┬────┬────┼*-──┼────┬────┬───*┬────┬────┬────┼-*--┬───
691        //    8    9    0    1    2    3    4    5    6    7    8    9    0 |  1
692        //            └─────────────────┼─────────────────┘                next
693        //                             1ulp
694        //
695        // s - shorter underestimate, S - shorter overestimate
696        // l - longer underestimate,  L - longer overestimate
697
698        if {
699            // Boundary case when rounding down to nearest 10.
700            scaled_sig_mod10 != scaled_half_ulp &&
701            // Near-boundary case when rounding up to nearest 10.
702            // Case where upper != ten is insufficient: 1.342178e+08f.
703            ten.wrapping_sub(upper) > 1 // upper != ten && upper != ten - 1
704        } {
705            let round_up = upper >= ten;
706            let shorter = (integral.into() - digit + u64::from(round_up) * 10) as i64;
707            let longer = (integral.into() + u64::from(fractional >= HALF_ULP)) as i64;
708            let use_shorter = scaled_sig_mod10 <= scaled_half_ulp || round_up;
709            return dec_fp {
710                #[cfg(zmij_no_select_unpredictable)]
711                sig: if use_shorter { shorter } else { longer },
712                #[cfg(not(zmij_no_select_unpredictable))]
713                sig: hint::select_unpredictable(use_shorter, shorter, longer),
714                exp: dec_exp,
715            };
716        }
717        break;
718    }
719
720    dec_exp = compute_dec_exp(bin_exp, regular);
721    let exp_shift = compute_exp_shift(bin_exp, dec_exp);
722    let mut pow10 = unsafe { POW10_SIGNIFICANDS.get_unchecked(-dec_exp) };
723
724    // Fallback to Schubfach to guarantee correctness in boundary cases. This
725    // requires switching to strict overestimates of powers of 10.
726    if num_bits == 64 {
727        pow10.lo += 1;
728    } else {
729        pow10.hi += 1;
730    }
731
732    // Shift the significand so that boundaries are integer.
733    const BOUND_SHIFT: u32 = 2;
734    let bin_sig_shifted = bin_sig << BOUND_SHIFT;
735
736    // Compute the estimates of lower and upper bounds of the rounding interval
737    // by multiplying them by the power of 10 and applying modified rounding.
738    let lsb = bin_sig & UInt::from(1);
739    let lower = (bin_sig_shifted - (UInt::from(regular) + UInt::from(1))) << exp_shift;
740    let lower = umul_upper_inexact_to_odd(pow10.hi, pow10.lo, lower) + lsb;
741    let upper = (bin_sig_shifted + UInt::from(2)) << exp_shift;
742    let upper = umul_upper_inexact_to_odd(pow10.hi, pow10.lo, upper) - lsb;
743
744    // The idea of using a single shorter candidate is by Cassio Neri.
745    // It is less or equal to the upper bound by construction.
746    let shorter = UInt::from(10) * ((upper >> BOUND_SHIFT) / UInt::from(10));
747    if (shorter << BOUND_SHIFT) >= lower {
748        return normalize::<UInt>(
749            dec_fp {
750                sig: shorter.into() as i64,
751                exp: dec_exp,
752            },
753            subnormal,
754        );
755    }
756
757    let scaled_sig = umul_upper_inexact_to_odd(pow10.hi, pow10.lo, bin_sig_shifted << exp_shift);
758    let longer_below = scaled_sig >> BOUND_SHIFT;
759    let longer_above = longer_below + UInt::from(1);
760
761    // Pick the closest of longer_below and longer_above and check if it's in
762    // the rounding interval.
763    let cmp = scaled_sig
764        .wrapping_sub((longer_below + longer_above) << 1)
765        .to_signed();
766    let below_closer = cmp < UInt::from(0).to_signed()
767        || (cmp == UInt::from(0).to_signed() && (longer_below & UInt::from(1)) == UInt::from(0));
768    let below_in = (longer_below << BOUND_SHIFT) >= lower;
769    let dec_sig = if below_closer & below_in {
770        longer_below
771    } else {
772        longer_above
773    };
774    normalize::<UInt>(
775        dec_fp {
776            sig: dec_sig.into() as i64,
777            exp: dec_exp,
778        },
779        subnormal,
780    )
781}
782
783/// Writes the shortest correctly rounded decimal representation of `value` to
784/// `buffer`. `buffer` should point to a buffer of size `buffer_size` or larger.
785#[cfg_attr(feature = "no-panic", no_panic)]
786unsafe fn write<Float>(value: Float, mut buffer: *mut u8) -> *mut u8
787where
788    Float: FloatTraits,
789{
790    let bits = value.to_bits();
791    let raw_exp = Float::get_exp(bits); // binary exponent
792    let mut bin_exp = raw_exp - Float::NUM_SIG_BITS - Float::EXP_BIAS;
793    // Compute the decimal exponent early to overlap its latency with other work.
794    let mut dec_exp = compute_dec_exp(bin_exp, true);
795
796    unsafe {
797        *buffer = b'-';
798    }
799    buffer = unsafe { buffer.add(usize::from(Float::is_negative(bits))) };
800
801    let mut bin_sig = Float::get_sig(bits); // binary significand
802    let mut regular = bin_sig != Float::SigType::from(0);
803    let special = raw_exp == 0;
804    if special {
805        if bin_sig == Float::SigType::from(0) {
806            return unsafe {
807                *buffer = b'0';
808                *buffer.add(1) = b'.';
809                *buffer.add(2) = b'0';
810                buffer.add(3)
811            };
812        }
813        bin_exp = 1 - Float::NUM_SIG_BITS - Float::EXP_BIAS;
814        bin_sig |= Float::IMPLICIT_BIT;
815        // Setting regular is not redundant: it has a measurable perf impact.
816        regular = true;
817    }
818    bin_sig ^= Float::IMPLICIT_BIT;
819
820    // Here be 🐉s.
821    let mut dec = to_decimal(bin_sig, bin_exp, dec_exp, regular, special);
822    dec_exp = dec.exp;
823
824    // Write significand.
825    let end = if Float::NUM_BITS == 64 {
826        dec_exp += Float::MAX_DIGITS10 as i32 + i32::from(dec.sig >= 10_000_000_000_000_000) - 2;
827        unsafe { write_significand17(buffer.add(1), dec.sig as u64) }
828    } else {
829        if dec.sig < 10_000_000 {
830            dec.sig *= 10;
831            dec_exp -= 1;
832        }
833        dec_exp += Float::MAX_DIGITS10 as i32 + i32::from(dec.sig >= 100_000_000) - 2;
834        unsafe { write_significand9(buffer.add(1), dec.sig as u32) }
835    };
836
837    let length = unsafe { end.offset_from(buffer.add(1)) } as usize;
838
839    if Float::NUM_BITS == 32 && (-6..=12).contains(&dec_exp)
840        || Float::NUM_BITS == 64 && (-5..=15).contains(&dec_exp)
841    {
842        if length as i32 - 1 <= dec_exp {
843            // 1234e7 -> 12340000000.0
844            return unsafe {
845                ptr::copy(buffer.add(1), buffer, length);
846                ptr::write_bytes(buffer.add(length), b'0', dec_exp as usize + 3 - length);
847                *buffer.add(dec_exp as usize + 1) = b'.';
848                buffer.add(dec_exp as usize + 3)
849            };
850        } else if 0 <= dec_exp {
851            // 1234e-2 -> 12.34
852            return unsafe {
853                ptr::copy(buffer.add(1), buffer, dec_exp as usize + 1);
854                *buffer.add(dec_exp as usize + 1) = b'.';
855                buffer.add(length + 1)
856            };
857        } else {
858            // 1234e-6 -> 0.001234
859            return unsafe {
860                ptr::copy(buffer.add(1), buffer.add((1 - dec_exp) as usize), length);
861                ptr::write_bytes(buffer, b'0', (1 - dec_exp) as usize);
862                *buffer.add(1) = b'.';
863                buffer.add((1 - dec_exp) as usize + length)
864            };
865        }
866    }
867
868    unsafe {
869        // 1234e30 -> 1.234e33
870        *buffer = *buffer.add(1);
871        *buffer.add(1) = b'.';
872    }
873    buffer = unsafe { buffer.add(length + usize::from(length > 1)) };
874
875    // Write exponent.
876    let sign_ptr = buffer;
877    let e_sign = if dec_exp >= 0 {
878        (u16::from(b'+') << 8) | u16::from(b'e')
879    } else {
880        (u16::from(b'-') << 8) | u16::from(b'e')
881    };
882    buffer = unsafe { buffer.add(1) };
883    let mask = i32::from(dec_exp >= 0) - 1;
884    dec_exp = (dec_exp + mask) ^ mask; // absolute value
885    buffer = unsafe { buffer.add(usize::from(dec_exp >= 10)) };
886    if Float::MIN_10_EXP >= -99 && Float::MAX_10_EXP <= 99 {
887        unsafe {
888            buffer
889                .cast::<u16>()
890                .write_unaligned(*digits2(dec_exp as usize));
891            sign_ptr.cast::<u16>().write_unaligned(e_sign.to_le());
892            return buffer.add(2);
893        }
894    }
895    // 19 is faster or equal to 12 even for 3 digits.
896    const DIV_EXP: u32 = 19;
897    const DIV_SIG: u32 = (1 << DIV_EXP) / 100 + 1;
898    let digit = (dec_exp as u32 * DIV_SIG) >> DIV_EXP; // value / 100
899    unsafe {
900        *buffer = b'0' + digit as u8;
901    }
902    buffer = unsafe { buffer.add(usize::from(dec_exp >= 100)) };
903    unsafe {
904        buffer
905            .cast::<u16>()
906            .write_unaligned(*digits2((dec_exp as u32 - digit * 100) as usize));
907        sign_ptr.cast::<u16>().write_unaligned(e_sign.to_le());
908        buffer.add(2)
909    }
910}
911
912/// Safe API for formatting floating point numbers to text.
913///
914/// ## Example
915///
916/// ```
917/// let mut buffer = zmij::Buffer::new();
918/// let printed = buffer.format_finite(1.234);
919/// assert_eq!(printed, "1.234");
920/// ```
921pub struct Buffer {
922    bytes: [MaybeUninit<u8>; BUFFER_SIZE],
923}
924
925impl Buffer {
926    /// This is a cheap operation; you don't need to worry about reusing buffers
927    /// for efficiency.
928    #[inline]
929    #[cfg_attr(feature = "no-panic", no_panic)]
930    pub fn new() -> Self {
931        let bytes = [MaybeUninit::<u8>::uninit(); BUFFER_SIZE];
932        Buffer { bytes }
933    }
934
935    /// Print a floating point number into this buffer and return a reference to
936    /// its string representation within the buffer.
937    ///
938    /// # Special cases
939    ///
940    /// This function formats NaN as the string "NaN", positive infinity as
941    /// "inf", and negative infinity as "-inf" to match std::fmt.
942    ///
943    /// If your input is known to be finite, you may get better performance by
944    /// calling the `format_finite` method instead of `format` to avoid the
945    /// checks for special cases.
946    #[cfg_attr(feature = "no-panic", no_panic)]
947    pub fn format<F: Float>(&mut self, f: F) -> &str {
948        if f.is_nonfinite() {
949            f.format_nonfinite()
950        } else {
951            self.format_finite(f)
952        }
953    }
954
955    /// Print a floating point number into this buffer and return a reference to
956    /// its string representation within the buffer.
957    ///
958    /// # Special cases
959    ///
960    /// This function **does not** check for NaN or infinity. If the input
961    /// number is not a finite float, the printed representation will be some
962    /// correctly formatted but unspecified numerical value.
963    ///
964    /// Please check [`is_finite`] yourself before calling this function, or
965    /// check [`is_nan`] and [`is_infinite`] and handle those cases yourself.
966    ///
967    /// [`is_finite`]: f64::is_finite
968    /// [`is_nan`]: f64::is_nan
969    /// [`is_infinite`]: f64::is_infinite
970    #[cfg_attr(feature = "no-panic", no_panic)]
971    pub fn format_finite<F: Float>(&mut self, f: F) -> &str {
972        unsafe {
973            let end = f.write_to_zmij_buffer(self.bytes.as_mut_ptr().cast::<u8>());
974            let len = end.offset_from(self.bytes.as_ptr().cast::<u8>()) as usize;
975            let slice = slice::from_raw_parts(self.bytes.as_ptr().cast::<u8>(), len);
976            str::from_utf8_unchecked(slice)
977        }
978    }
979}
980
981/// A floating point number, f32 or f64, that can be written into a
982/// [`zmij::Buffer`][Buffer].
983///
984/// This trait is sealed and cannot be implemented for types outside of the
985/// `zmij` crate.
986#[allow(unknown_lints)] // rustc older than 1.74
987#[allow(private_bounds)]
988pub trait Float: private::Sealed {}
989impl Float for f32 {}
990impl Float for f64 {}
991
992mod private {
993    pub trait Sealed: crate::traits::Float {
994        fn is_nonfinite(self) -> bool;
995        fn format_nonfinite(self) -> &'static str;
996        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8;
997    }
998
999    impl Sealed for f32 {
1000        #[inline]
1001        fn is_nonfinite(self) -> bool {
1002            const EXP_MASK: u32 = 0x7f800000;
1003            let bits = self.to_bits();
1004            bits & EXP_MASK == EXP_MASK
1005        }
1006
1007        #[cold]
1008        #[cfg_attr(feature = "no-panic", inline)]
1009        fn format_nonfinite(self) -> &'static str {
1010            const MANTISSA_MASK: u32 = 0x007fffff;
1011            const SIGN_MASK: u32 = 0x80000000;
1012            let bits = self.to_bits();
1013            if bits & MANTISSA_MASK != 0 {
1014                crate::NAN
1015            } else if bits & SIGN_MASK != 0 {
1016                crate::NEG_INFINITY
1017            } else {
1018                crate::INFINITY
1019            }
1020        }
1021
1022        #[cfg_attr(feature = "no-panic", inline)]
1023        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8 {
1024            unsafe { crate::write(self, buffer) }
1025        }
1026    }
1027
1028    impl Sealed for f64 {
1029        #[inline]
1030        fn is_nonfinite(self) -> bool {
1031            const EXP_MASK: u64 = 0x7ff0000000000000;
1032            let bits = self.to_bits();
1033            bits & EXP_MASK == EXP_MASK
1034        }
1035
1036        #[cold]
1037        #[cfg_attr(feature = "no-panic", inline)]
1038        fn format_nonfinite(self) -> &'static str {
1039            const MANTISSA_MASK: u64 = 0x000fffffffffffff;
1040            const SIGN_MASK: u64 = 0x8000000000000000;
1041            let bits = self.to_bits();
1042            if bits & MANTISSA_MASK != 0 {
1043                crate::NAN
1044            } else if bits & SIGN_MASK != 0 {
1045                crate::NEG_INFINITY
1046            } else {
1047                crate::INFINITY
1048            }
1049        }
1050
1051        #[cfg_attr(feature = "no-panic", inline)]
1052        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8 {
1053            unsafe { crate::write(self, buffer) }
1054        }
1055    }
1056}
1057
1058impl Default for Buffer {
1059    #[inline]
1060    #[cfg_attr(feature = "no-panic", no_panic)]
1061    fn default() -> Self {
1062        Buffer::new()
1063    }
1064}