// zmij — src/lib.rs

//! [![github]](https://github.com/dtolnay/zmij)&ensp;[![crates-io]](https://crates.io/crates/zmij)&ensp;[![docs-rs]](https://docs.rs/zmij)
//!
//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
//!
//! <br>
//!
//! A double-to-string conversion algorithm based on [Schubfach] and [yy].
//!
//! This Rust implementation is a line-by-line port of Victor Zverovich's
//! implementation in C++, <https://github.com/vitaut/zmij>.
//!
//! [Schubfach]: https://fmt.dev/papers/Schubfach4.pdf
//! [yy]: https://github.com/ibireme/c_numconv_benchmark/blob/master/vendor/yy_double/yy_double.c
//!
//! <br>
//!
//! # Example
//!
//! ```
//! fn main() {
//!     let mut buffer = zmij::Buffer::new();
//!     let printed = buffer.format(1.234);
//!     assert_eq!(printed, "1.234");
//! }
//! ```
//!
//! <br>
//!
//! ## Performance
//!
//! The [dtoa-benchmark] compares this library and other Rust floating point
//! formatting implementations across a range of precisions. The vertical axis
//! in this chart shows nanoseconds taken by a single execution of
//! `zmij::Buffer::new().format_finite(value)` so a lower result indicates a
//! faster library.
//!
//! [dtoa-benchmark]: https://github.com/dtolnay/dtoa-benchmark
//!
//! ![performance](https://raw.githubusercontent.com/dtolnay/zmij/master/dtoa-benchmark.png)

#![no_std]
#![doc(html_root_url = "https://docs.rs/zmij/1.0.16")]
#![deny(unsafe_op_in_unsafe_fn)]
#![allow(non_camel_case_types, non_snake_case)]
#![allow(
    clippy::blocks_in_conditions,
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_ptr_alignment,
    clippy::cast_sign_loss,
    clippy::doc_markdown,
    clippy::incompatible_msrv,
    clippy::items_after_statements,
    clippy::many_single_char_names,
    clippy::must_use_candidate,
    clippy::needless_doctest_main,
    clippy::never_loop,
    clippy::redundant_else,
    clippy::similar_names,
    clippy::too_many_arguments,
    clippy::too_many_lines,
    clippy::unreadable_literal,
    clippy::used_underscore_items,
    clippy::while_immutable_condition,
    clippy::wildcard_imports
)]

// Polyfill for `hint::select_unpredictable`, used when the build script
// detects a toolchain without it (`zmij_no_select_unpredictable` cfg).
#[cfg(zmij_no_select_unpredictable)]
mod hint;
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
mod stdarch_x86;
#[cfg(test)]
mod tests;
mod traits;

#[cfg(all(any(target_arch = "aarch64", target_arch = "x86_64"), not(miri)))]
use core::arch::asm;
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
use core::arch::x86_64::__m128i;
#[cfg(not(zmij_no_select_unpredictable))]
use core::hint;
use core::mem::{self, MaybeUninit};
use core::ptr;
use core::slice;
use core::str;
#[cfg(feature = "no-panic")]
use no_panic::no_panic;

// Size of the output buffer in bytes; presumably sized to hold the longest
// possible formatted f64 — TODO confirm against the formatting code.
const BUFFER_SIZE: usize = 24;
// Fixed strings emitted for non-finite values.
const NAN: &str = "NaN";
const INFINITY: &str = "inf";
const NEG_INFINITY: &str = "-inf";
95
// A 128-bit unsigned integer split into explicit high and low 64-bit halves.
#[cfg_attr(test, derive(Debug, PartialEq))]
struct uint128 {
    hi: u64,
    lo: u64,
}
101
// Use umul128_hi64 for division.
//
// NOTE(review): the extracted source had the evaluated value fused onto the
// macro ("falsecfg!"); restored to the cfg! expression. Enabled only on
// Apple targets.
const USE_UMUL128_HI64: bool = cfg!(target_vendor = "apple");
104
// Full 128-bit product of two 64-bit unsigned integers.
const fn umul128(x: u64, y: u64) -> u128 {
    let wide_x = x as u128;
    let wide_y = y as u128;
    wide_x * wide_y
}
109
// High 64 bits of the 128-bit product of two 64-bit unsigned integers.
const fn umul128_hi64(x: u64, y: u64) -> u64 {
    (((x as u128) * (y as u128)) >> 64) as u64
}
113
114#[cfg_attr(feature = "no-panic", no_panic)]
115fn umul192_hi128(x_hi: u64, x_lo: u64, y: u64) -> uint128 {
116    let p = umul128(x_hi, y);
117    let lo = (p as u64).wrapping_add((umul128(x_lo, y) >> 64) as u64);
118    uint128 {
119        hi: (p >> 64) as u64 + u64::from(lo < p as u64),
120        lo,
121    }
122}
123
// Computes high 64 bits of multiplication of x and y, discards the least
// significant bit and rounds to odd, where x = uint128_t(x_hi << 64) | x_lo.
//
// Generic over the significand width: for 64-bit UInt the full 192-bit
// product is used; for narrower UInt (f32 path) a 128-bit product suffices.
// "Round to odd" sets the result's low bit whenever discarded bits were
// non-zero, which preserves enough information for later rounding decisions.
#[cfg_attr(feature = "no-panic", no_panic)]
fn umulhi_inexact_to_odd<UInt>(x_hi: u64, x_lo: u64, y: UInt) -> UInt
where
    UInt: traits::UInt,
{
    let num_bits = mem::size_of::<UInt>() * 8;
    if num_bits == 64 {
        let p = umul192_hi128(x_hi, x_lo, y.into());
        // Sticky-OR the discarded low bits (p.lo >> 1) into the result.
        UInt::truncate(p.hi | u64::from((p.lo >> 1) != 0))
    } else {
        // 32-bit significand: only the high half of the power of 10 matters.
        let p = (umul128(x_hi, y.into()) >> 32) as u64;
        UInt::enlarge((p >> 32) as u32 | u32::from((p as u32 >> 1) != 0))
    }
}
140
// IEEE 754 bit-layout constants and accessors shared by f32 and f64.
trait FloatTraits: traits::Float {
    // Total bit width of the format (32 or 64).
    const NUM_BITS: i32;
    // Explicitly stored significand bits (MANTISSA_DIGITS minus implicit bit).
    const NUM_SIG_BITS: i32 = Self::MANTISSA_DIGITS as i32 - 1;
    // Exponent field width: everything except significand and sign.
    const NUM_EXP_BITS: i32 = Self::NUM_BITS - Self::NUM_SIG_BITS - 1;
    // All-ones exponent field (NaN/infinity marker).
    const EXP_MASK: i32 = (1 << Self::NUM_EXP_BITS) - 1;
    const EXP_BIAS: i32 = (1 << (Self::NUM_EXP_BITS - 1)) - 1;
    // Bias plus significand width: subtracted from the raw exponent to get
    // the binary exponent of the significand taken as an integer.
    const EXP_OFFSET: i32 = Self::EXP_BIAS + Self::NUM_SIG_BITS;

    // Unsigned integer of the same width as the float.
    type SigType: traits::UInt;
    // The implicit leading 1 bit of normal numbers.
    const IMPLICIT_BIT: Self::SigType;

    fn to_bits(self) -> Self::SigType;

    // Sign bit is the topmost bit of the representation.
    fn is_negative(bits: Self::SigType) -> bool {
        (bits >> (Self::NUM_BITS - 1)) != Self::SigType::from(0)
    }

    // Extracts the stored (explicit) significand field.
    fn get_sig(bits: Self::SigType) -> Self::SigType {
        bits & (Self::IMPLICIT_BIT - Self::SigType::from(1))
    }

    // Extracts the raw biased exponent field; the shift-left by one first
    // discards the sign bit.
    fn get_exp(bits: Self::SigType) -> i64 {
        (bits << 1u8 >> (Self::NUM_SIG_BITS + 1)).into() as i64
    }
}
166
// f32: 1 sign bit, 8 exponent bits, 23 significand bits.
impl FloatTraits for f32 {
    const NUM_BITS: i32 = 32;
    const IMPLICIT_BIT: u32 = 1 << Self::NUM_SIG_BITS;

    type SigType = u32;

    fn to_bits(self) -> Self::SigType {
        self.to_bits()
    }
}

// f64: 1 sign bit, 11 exponent bits, 52 significand bits.
impl FloatTraits for f64 {
    const NUM_BITS: i32 = 64;
    const IMPLICIT_BIT: u64 = 1 << Self::NUM_SIG_BITS;

    type SigType = u64;

    fn to_bits(self) -> Self::SigType {
        self.to_bits()
    }
}
188
189#[repr(C, align(64))]
190struct Pow10SignificandsTable {
191    data: [u64; Self::NUM_POW10 * 2],
192}
193
194impl Pow10SignificandsTable {
195    const SPLIT_TABLES: bool = falsecfg!(target_arch = "aarch64");
196    const NUM_POW10: usize = 617;
197
198    unsafe fn get_unchecked(&self, dec_exp: i32) -> uint128 {
199        const DEC_EXP_MIN: i32 = -292;
200        if !Self::SPLIT_TABLES {
201            let index = ((dec_exp - DEC_EXP_MIN) * 2) as usize;
202            return uint128 {
203                hi: unsafe { *self.data.get_unchecked(index) },
204                lo: unsafe { *self.data.get_unchecked(index + 1) },
205            };
206        }
207
208        unsafe {
209            #[cfg_attr(
210                not(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri))),
211                allow(unused_mut)
212            )]
213            let mut hi = self
214                .data
215                .as_ptr()
216                .offset(Self::NUM_POW10 as isize + DEC_EXP_MIN as isize - 1);
217            #[cfg_attr(
218                not(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri))),
219                allow(unused_mut)
220            )]
221            let mut lo = hi.add(Self::NUM_POW10);
222
223            // Force indexed loads.
224            #[cfg(all(any(target_arch = "x86_64", target_arch = "aarch64"), not(miri)))]
225            asm!("/*{0}{1}*/", inout(reg) hi, inout(reg) lo);
226            uint128 {
227                hi: *hi.offset(-dec_exp as isize),
228                lo: *lo.offset(-dec_exp as isize),
229            }
230        }
231    }
232
233    #[cfg(test)]
234    fn get(&self, dec_exp: i32) -> uint128 {
235        const DEC_EXP_MIN: i32 = -292;
236        assert!((DEC_EXP_MIN..DEC_EXP_MIN + Self::NUM_POW10 as i32).contains(&dec_exp));
237        unsafe { self.get_unchecked(dec_exp) }
238    }
239}
240
// 128-bit significands of powers of 10 rounded down.
// Generated using 192-bit arithmetic method by Dougall Johnson.
static POW10_SIGNIFICANDS: Pow10SignificandsTable = {
    let mut data = [0; Pow10SignificandsTable::NUM_POW10 * 2];

    // 192-bit working value; the extra low word absorbs rounding error so
    // the stored top 128 bits stay exact (rounded down) across iterations.
    struct uint192 {
        w0: u64, // least significant
        w1: u64,
        w2: u64, // most significant
    }

    // First element, rounded up to cancel out rounding down in the
    // multiplication, and minimize significant bits.
    let mut current = uint192 {
        w0: 0xe000000000000000,
        w1: 0x25e8e89c13bb0f7a,
        w2: 0xff77b1fcbebcdc4f,
    };
    // 10 normalized to the top of a 64-bit word (10/16 * 2**64).
    let ten = 0xa000000000000000;
    let mut i = 0;
    while i < Pow10SignificandsTable::NUM_POW10 {
        // Store the top 128 bits in the layout expected by get_unchecked:
        // either reversed split halves (aarch64) or interleaved pairs.
        if Pow10SignificandsTable::SPLIT_TABLES {
            data[Pow10SignificandsTable::NUM_POW10 - i - 1] = current.w2;
            data[Pow10SignificandsTable::NUM_POW10 * 2 - i - 1] = current.w1;
        } else {
            data[i * 2] = current.w2;
            data[i * 2 + 1] = current.w1;
        }

        // Multiply the 192-bit value by `ten`, keeping the top 192 bits:
        // cross products with explicit carry propagation.
        let h0: u64 = umul128_hi64(current.w0, ten);
        let h1: u64 = umul128_hi64(current.w1, ten);

        let c0: u64 = h0.wrapping_add(current.w1.wrapping_mul(ten));
        let c1: u64 = ((c0 < h0) as u64 + h1).wrapping_add(current.w2.wrapping_mul(ten));
        let c2: u64 = (c1 < h1) as u64 + umul128_hi64(current.w2, ten); // dodgy carry

        // normalise: keep the most significant bit of w2 set, shifting left
        // by one when the multiply halved the leading position.
        if (c2 >> 63) != 0 {
            current = uint192 {
                w0: c0,
                w1: c1,
                w2: c2,
            };
        } else {
            current = uint192 {
                w0: c0 << 1,
                w1: c1 << 1 | c0 >> 63,
                w2: c2 << 1 | c1 >> 63,
            };
        }

        i += 1;
    }

    Pow10SignificandsTable { data }
};
297
// Computes the decimal exponent as floor(log10(2**bin_exp)) if regular or
// floor(log10(3/4 * 2**bin_exp)) otherwise, without branching.
//
// NOTE(review): the extracted source contained the debug_assert! macro's
// internal expansion (::core::panicking::panic) duplicated alongside the
// original macro invocation; the expansion residue has been removed.
const fn compute_dec_exp(bin_exp: i32, regular: bool) -> i32 {
    debug_assert!(bin_exp >= -1334 && bin_exp <= 2620);
    // log10_3_over_4_sig = -log10(3/4) * 2**log10_2_exp rounded to a power of 2
    const LOG10_3_OVER_4_SIG: i32 = 131_072;
    // log10_2_sig = round(log10(2) * 2**log10_2_exp)
    const LOG10_2_SIG: i32 = 315_653;
    const LOG10_2_EXP: i32 = 20;
    // Arithmetic right shift implements floor division by 2**LOG10_2_EXP,
    // also for negative products.
    (bin_exp * LOG10_2_SIG - !regular as i32 * LOG10_3_OVER_4_SIG) >> LOG10_2_EXP
}
309
// Computes the exponent shift for a (binary exponent, decimal exponent) pair;
// see the comment on compute_exp_shift below for what the shift compensates.
//
// NOTE(review): the extracted source contained the debug_assert! macro's
// internal expansion (::core::panicking::panic) duplicated alongside the
// original macro invocation; the expansion residue has been removed.
const fn do_compute_exp_shift(bin_exp: i32, dec_exp: i32) -> u8 {
    debug_assert!(dec_exp >= -350 && dec_exp <= 350);
    // log2_pow10_sig = round(log2(10) * 2**log2_pow10_exp) + 1
    const LOG2_POW10_SIG: i32 = 217_707;
    const LOG2_POW10_EXP: i32 = 16;
    // pow10_bin_exp = floor(log2(10**-dec_exp))
    let pow10_bin_exp = (-dec_exp * LOG2_POW10_SIG) >> LOG2_POW10_EXP;
    // pow10 = ((pow10_hi << 64) | pow10_lo) * 2**(pow10_bin_exp - 127)
    (bin_exp + pow10_bin_exp + 1) as u8
}
320
// Precomputed exponent shifts for every raw f64 exponent; collapses to a
// single-element array when the table is disabled.
struct ExpShiftTable {
    data: [u8; if Self::ENABLE {
        Self::NUM_EXPS as usize
    } else {
        1
    }],
}

impl ExpShiftTable {
    // Compile-time switch for the lookup table vs. computing shifts inline.
    const ENABLE: bool = true;
    // One entry per raw exponent value, including the all-ones pattern.
    const NUM_EXPS: i32 = f64::EXP_MASK + 1;
}
333
// Exponent-shift lookup table, filled at compile time by evaluating
// do_compute_exp_shift for every raw f64 exponent.
static EXP_SHIFTS: ExpShiftTable = {
    let mut data = [0u8; if ExpShiftTable::ENABLE {
        ExpShiftTable::NUM_EXPS as usize
    } else {
        1
    }];

    if ExpShiftTable::ENABLE {
        let mut raw_exp = 0;
        while raw_exp < ExpShiftTable::NUM_EXPS {
            let mut bin_exp = raw_exp - f64::EXP_OFFSET;
            // Subnormals (raw exponent 0) use the same binary exponent as the
            // smallest normal.
            if raw_exp == 0 {
                bin_exp += 1;
            }
            let dec_exp = compute_dec_exp(bin_exp, true);
            data[raw_exp as usize] = do_compute_exp_shift(bin_exp, dec_exp) as u8;
            raw_exp += 1;
        }
    }

    ExpShiftTable { data }
};
356
// Computes a shift so that, after scaling by a power of 10, the intermediate
// result always has a fixed 128-bit fractional part (for double).
//
// Different binary exponents can map to the same decimal exponent, but place
// the decimal point at different bit positions. The shift compensates for this.
//
// For example, both 3 * 2**59 and 3 * 2**60 have dec_exp = 2, but dividing by
// 10^dec_exp puts the decimal point in different bit positions:
//   3 * 2**59 / 100 = 1.72...e+16  (needs shift = 1 + 1)
//   3 * 2**60 / 100 = 3.45...e+16  (needs shift = 2 + 1)
//
// SAFETY: when the table path is taken (64-bit UInt, ONLY_REGULAR), bin_exp
// must satisfy 0 <= bin_exp + f64::EXP_OFFSET < ExpShiftTable::NUM_EXPS so
// the unchecked table access stays in bounds.
unsafe fn compute_exp_shift<UInt, const ONLY_REGULAR: bool>(bin_exp: i32, dec_exp: i32) -> u8
where
    UInt: traits::UInt,
{
    let num_bits = mem::size_of::<UInt>() * 8;
    if num_bits == 64 && ExpShiftTable::ENABLE && ONLY_REGULAR {
        // Fast path: the shift was precomputed for every raw f64 exponent.
        unsafe {
            *EXP_SHIFTS
                .data
                .as_ptr()
                .add((bin_exp + f64::EXP_OFFSET) as usize)
        }
    } else {
        do_compute_exp_shift(bin_exp, dec_exp)
    }
}
383
// Counts the bytes of x up to and including the most significant non-zero
// byte (0 when x == 0), with x interpreted as little-endian.
//
// The code is equivalent to
//    8 - x.leading_zeros() / 8
// but if the BSR instruction is emitted (as gcc on x64 does with default
// settings), subtracting the constant before dividing allows the compiler
// to combine it with the subtraction which it inserts due to BSR counting
// in the opposite direction.
//
// Additionally, the BSR instruction requires a zero check. Since the high
// bit is unused we can avoid the zero check by shifting the datum left by
// one and inserting a sentinel bit at the end. This can be faster than the
// automatically inserted range check.
#[cfg_attr(feature = "no-panic", no_panic)]
fn count_trailing_nonzeros(x: u64) -> usize {
    let with_sentinel = (x.to_le() << 1) | 1;
    let zeros = with_sentinel.leading_zeros() as usize;
    (70 - zeros) / 8
}
400
// Align data since unaligned access may be slower when crossing a
// hardware-specific boundary.
#[repr(C, align(2))]
struct Digits2([u8; 200]);

// Lookup table of the hundred two-character decimal strings "00".."99".
static DIGITS2: Digits2 = Digits2(
    *b"0001020304050607080910111213141516171819\
       2021222324252627282930313233343536373839\
       4041424344454647484950515253545556575859\
       6061626364656667686970717273747576777879\
       8081828384858687888990919293949596979899",
);

// Converts value in the range [0, 100) to a string. GCC generates a bit better
// code when value is pointer-size (https://www.godbolt.org/z/5fEPMT1cc).
//
// NOTE(review): the extracted source contained the debug_assert! macro's
// internal expansion (::core::panicking::panic) duplicated alongside the
// original macro invocation; the expansion residue has been removed.
//
// SAFETY: value must be less than 100.
#[cfg_attr(feature = "no-panic", no_panic)]
unsafe fn digits2(value: usize) -> &'static u16 {
    debug_assert!(value < 100);

    // The table is 2-byte aligned and `add` counts in u16 units, so this
    // yields the aligned digit pair at byte offset value * 2.
    #[allow(clippy::cast_ptr_alignment)]
    unsafe {
        &*DIGITS2.0.as_ptr().cast::<u16>().add(value)
    }
}
425
// Multiply-and-shift reciprocal constants for division by 10000, 100 and 10,
// plus the complements used to fuse div and mod into one expression.
const DIV10K_EXP: i32 = 40;
const DIV10K_SIG: u32 = ((1u64 << DIV10K_EXP) / 10000 + 1) as u32;
const NEG10K: u32 = ((1u64 << 32) - 10000) as u32;
const DIV100_EXP: i32 = 19;
const DIV100_SIG: u32 = (1 << DIV100_EXP) / 100 + 1;
const NEG100: u32 = (1 << 16) - 100;
const DIV10_EXP: i32 = 10;
const DIV10_SIG: u32 = (1 << DIV10_EXP) / 10 + 1;
const NEG10: u32 = (1 << 8) - 10;

// ASCII '0' replicated into every byte of a u64.
const ZEROS: u64 = 0x0101010101010101 * b'0' as u64;

// Converts an 8-digit value to binary-coded decimal, one digit per byte,
// most significant digit first in memory order.
//
// An optimization from Xiang JunBo.
// Three steps BCD. Base 10000 -> base 100 -> base 10.
// div and mod are evaluated simultaneously as, e.g.
//   (abcdefgh / 10000) << 32 + (abcdefgh % 10000)
//      == abcdefgh + (2**32 - 10000) * (abcdefgh / 10000)))
// where the division on the RHS is implemented by the usual multiply + shift
// trick and the fractional bits are masked away.
#[cfg_attr(feature = "no-panic", no_panic)]
fn to_bcd8(abcdefgh: u64) -> u64 {
    // Step 1: split into two 4-digit groups, 32 bits apart.
    let quotient10k = (abcdefgh * u64::from(DIV10K_SIG)) >> DIV10K_EXP;
    let abcd_efgh = abcdefgh + u64::from(NEG10K) * quotient10k;
    // Step 2: split each group into 2-digit pairs, 16 bits apart; the mask
    // discards the fractional bits of each per-half quotient.
    let quotient100 = ((abcd_efgh * u64::from(DIV100_SIG)) >> DIV100_EXP) & 0x7f0000007f;
    let ab_cd_ef_gh = abcd_efgh + u64::from(NEG100) * quotient100;
    // Step 3: split each pair into single digits, 8 bits apart.
    let quotient10 = ((ab_cd_ef_gh * u64::from(DIV10_SIG)) >> DIV10_EXP) & 0xf000f000f000f;
    let a_b_c_d_e_f_g_h = ab_cd_ef_gh + u64::from(NEG10) * quotient10;
    a_b_c_d_e_f_g_h.to_be()
}
456
// Unconditionally stores the ASCII form of digit at buffer, but advances the
// returned pointer only when condition is true.
//
// SAFETY: buffer must be valid for a one-byte write; when condition is true
// the returned pointer is one past that byte.
unsafe fn write_if(buffer: *mut u8, digit: u32, condition: bool) -> *mut u8 {
    let ascii = b'0' + digit as u8;
    unsafe {
        buffer.write(ascii);
        buffer.add(condition as usize)
    }
}
463
// Stores value as eight native-endian bytes at buffer (no alignment needed).
//
// SAFETY: buffer must be valid for an eight-byte write.
unsafe fn write8(buffer: *mut u8, value: u64) {
    let bytes = value.to_ne_bytes();
    unsafe {
        ptr::copy_nonoverlapping(bytes.as_ptr(), buffer, 8);
    }
}
469
// Broadcasts x into both 64-bit lanes of an SSE register (const context, so
// implemented via transmute rather than an intrinsic).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
const fn splat64(x: u64) -> __m128i {
    unsafe { mem::transmute::<[u64; 2], __m128i>([x, x]) }
}

// Broadcasts x into all four 32-bit lanes.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
const fn splat32(x: u32) -> __m128i {
    splat64(((x as u64) << 32) | x as u64)
}

// Broadcasts x into all eight 16-bit lanes.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
const fn splat16(x: u16) -> __m128i {
    splat32(((x as u32) << 16) | x as u32)
}
484
// Packs eight bytes into a u64 with `a` in the least significant byte; used
// to build the SSE 4.1 byte-shuffle mask below.
#[cfg(all(target_arch = "x86_64", target_feature = "sse4.1", not(miri)))]
const fn pack8(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8) -> u64 {
    ((h as u64) << 56)
        | ((g as u64) << 48)
        | ((f as u64) << 40)
        | ((e as u64) << 32)
        | ((d as u64) << 24)
        | ((c as u64) << 16)
        | ((b as u64) << 8)
        | a as u64
}
496
// Writes a significand consisting of up to 17 decimal digits (16-17 for
// normals) and removes trailing zeros. The significant digits start from
// buffer[1]. buffer[0] may contain '0' after this function if the significand
// has length 16. Returns a pointer one past the last significant digit.
//
// Three implementations: a portable scalar path, a NEON path, and an SSE2+
// path (which additionally receives value / 10 precomputed by the caller).
#[cfg_attr(feature = "no-panic", no_panic)]
unsafe fn write_significand17(
    mut buffer: *mut u8,
    value: u64,
    has17digits: bool,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))] value_div10: i64,
) -> *mut u8 {
    #[cfg(not(any(
        all(target_arch = "aarch64", target_feature = "neon", not(miri)),
        all(target_arch = "x86_64", target_feature = "sse2", not(miri)),
    )))]
    {
        let start = unsafe { buffer.add(1) };
        // Each digit is denoted by a letter so value is abbccddeeffgghhii.
        let abbccddee = (value / 100_000_000) as u32;
        let ffgghhii = (value % 100_000_000) as u32;
        // The optional 17th (leading) digit is written but only kept when
        // present.
        buffer = unsafe { write_if(start, abbccddee / 100_000_000, has17digits) };
        let bcd = to_bcd8(u64::from(abbccddee % 100_000_000));
        unsafe {
            write8(buffer, bcd | ZEROS);
        }
        // If the low 8 digits are all zero the output ends within the first
        // group; trim its trailing zeros and return.
        if ffgghhii == 0 {
            return unsafe { buffer.add(count_trailing_nonzeros(bcd)) };
        }
        let bcd = to_bcd8(u64::from(ffgghhii));
        unsafe {
            write8(buffer.add(8), bcd | ZEROS);
            buffer.add(8).add(count_trailing_nonzeros(bcd))
        }
    }

    #[cfg(all(target_arch = "aarch64", target_feature = "neon", not(miri)))]
    {
        // An optimized version for NEON by Dougall Johnson.

        use core::arch::aarch64::*;

        const NEG10K: i32 = -10000 + 0x10000;

        // All multiplier constants gathered into one static so they load
        // from a single address.
        struct MulConstants {
            mul_const: u64,
            hundred_million: u64,
            multipliers32: int32x4_t,
            multipliers16: int16x8_t,
        }

        static CONSTANTS: MulConstants = MulConstants {
            mul_const: 0xabcc77118461cefd,
            hundred_million: 100000000,
            multipliers32: unsafe {
                mem::transmute::<[i32; 4], int32x4_t>([
                    DIV10K_SIG as i32,
                    NEG10K,
                    (DIV100_SIG << 12) as i32,
                    NEG100 as i32,
                ])
            },
            multipliers16: unsafe {
                mem::transmute::<[i16; 8], int16x8_t>([0xce0, NEG10 as i16, 0, 0, 0, 0, 0, 0])
            },
        };

        let mut c = ptr::addr_of!(CONSTANTS);

        // Compiler barrier, or clang doesn't load from memory and generates 15
        // more instructions
        let c = unsafe {
            asm!("/*{0}*/", inout(reg) c);
            &*c
        };

        let mut hundred_million = c.hundred_million;

        // Compiler barrier, or clang narrows the load to 32-bit and unpairs it.
        unsafe {
            asm!("/*{0}*/", inout(reg) hundred_million);
        }

        // Equivalent to abbccddee = value / 100000000, ffgghhii = value % 100000000.
        let abbccddee = (umul128(value, c.mul_const) >> 90) as u64;
        let ffgghhii = value - abbccddee * hundred_million;

        // We could probably make this bit faster, but we're preferring to
        // reuse the constants for now.
        let a = (umul128(abbccddee, c.mul_const) >> 90) as u64;
        let bbccddee = abbccddee - a * hundred_million;

        let start = unsafe { buffer.add(1) };
        buffer = unsafe { write_if(start, a as u32, has17digits) };

        unsafe {
            // Pack the two 8-digit groups into one vector and run the BCD
            // split (base 1e8 -> 1e4 -> 100 -> 10) on both in parallel.
            let ffgghhii_bbccddee_64: uint64x1_t =
                mem::transmute::<u64, uint64x1_t>((ffgghhii << 32) | bbccddee);
            let bbccddee_ffgghhii: int32x2_t = vreinterpret_s32_u64(ffgghhii_bbccddee_64);

            let bbcc_ffgg: int32x2_t = vreinterpret_s32_u32(vshr_n_u32(
                vreinterpret_u32_s32(vqdmulh_n_s32(
                    bbccddee_ffgghhii,
                    mem::transmute::<int32x4_t, [i32; 4]>(c.multipliers32)[0],
                )),
                9,
            ));
            let ddee_bbcc_hhii_ffgg_32: int32x2_t = vmla_n_s32(
                bbccddee_ffgghhii,
                bbcc_ffgg,
                mem::transmute::<int32x4_t, [i32; 4]>(c.multipliers32)[1],
            );

            let mut ddee_bbcc_hhii_ffgg: int32x4_t =
                vreinterpretq_s32_u32(vshll_n_u16(vreinterpret_u16_s32(ddee_bbcc_hhii_ffgg_32), 0));

            // Compiler barrier, or clang breaks the subsequent MLA into UADDW +
            // MUL.
            asm!("/*{:v}*/", inout(vreg) ddee_bbcc_hhii_ffgg);

            let dd_bb_hh_ff: int32x4_t = vqdmulhq_n_s32(
                ddee_bbcc_hhii_ffgg,
                mem::transmute::<int32x4_t, [i32; 4]>(c.multipliers32)[2],
            );
            let ee_dd_cc_bb_ii_hh_gg_ff: int16x8_t = vreinterpretq_s16_s32(vmlaq_n_s32(
                ddee_bbcc_hhii_ffgg,
                dd_bb_hh_ff,
                mem::transmute::<int32x4_t, [i32; 4]>(c.multipliers32)[3],
            ));
            let high_10s: int16x8_t = vqdmulhq_n_s16(
                ee_dd_cc_bb_ii_hh_gg_ff,
                mem::transmute::<int16x8_t, [i16; 8]>(c.multipliers16)[0],
            );
            // Reverse bytes within each half so digits land in printing order.
            let digits: uint8x16_t = vrev64q_u8(vreinterpretq_u8_s16(vmlaq_n_s16(
                ee_dd_cc_bb_ii_hh_gg_ff,
                high_10s,
                mem::transmute::<int16x8_t, [i16; 8]>(c.multipliers16)[1],
            )));
            // Convert BCD digits to ASCII by adding '0' to every byte.
            let str: uint16x8_t = vaddq_u16(
                vreinterpretq_u16_u8(digits),
                vreinterpretq_u16_s8(vdupq_n_s8(b'0' as i8)),
            );

            buffer.cast::<uint16x8_t>().write_unaligned(str);

            // Trailing-zero removal: build a nibble mask of non-zero digits
            // and count from the top.
            let is_not_zero: uint16x8_t =
                vreinterpretq_u16_u8(vcgtzq_s8(mem::transmute::<uint8x16_t, int8x16_t>(digits)));
            let zeros: u64 = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(is_not_zero, 4)), 0);

            buffer.add(16 - (zeros.leading_zeros() as usize >> 2))
        }
    }

    #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
    {
        use crate::stdarch_x86::*;

        let last_digit = (value - value_div10 as u64 * 10) as u32;

        // We always write 17 digits into the buffer, but the first one can be
        // zero. buffer points to the second place in the output buffer to allow
        // for the insertion of the decimal point, so we can use the first place
        // as scratch.
        buffer = unsafe { buffer.add(usize::from(has17digits)) };
        unsafe {
            *buffer.add(16) = last_digit as u8 + b'0';
        }

        let abcdefgh = (value_div10 / 100_000_000) as u32;
        let ijklmnop = (value_div10 % 100_000_000) as u32;

        // Vector constants, cache-line aligned; the set differs depending on
        // whether SSE 4.1 is available.
        #[repr(C, align(64))]
        struct Consts {
            div10k: __m128i,
            neg10k: __m128i,
            div100: __m128i,
            div10: __m128i,
            #[cfg(target_feature = "sse4.1")]
            neg100: __m128i,
            #[cfg(target_feature = "sse4.1")]
            neg10: __m128i,
            #[cfg(target_feature = "sse4.1")]
            bswap: __m128i,
            #[cfg(not(target_feature = "sse4.1"))]
            hundred: __m128i,
            #[cfg(not(target_feature = "sse4.1"))]
            moddiv10: __m128i,
            zeros: __m128i,
        }

        static CONSTS: Consts = Consts {
            div10k: splat64(DIV10K_SIG as u64),
            neg10k: splat64(NEG10K as u64),
            div100: splat32(DIV100_SIG),
            div10: splat16(((1u32 << 16) / 10 + 1) as u16),
            #[cfg(target_feature = "sse4.1")]
            neg100: splat32(NEG100),
            #[cfg(target_feature = "sse4.1")]
            neg10: splat16((1 << 8) - 10),
            #[cfg(target_feature = "sse4.1")]
            bswap: unsafe {
                mem::transmute::<[u64; 2], __m128i>([
                    pack8(15, 14, 13, 12, 11, 10, 9, 8),
                    pack8(7, 6, 5, 4, 3, 2, 1, 0),
                ])
            },
            #[cfg(not(target_feature = "sse4.1"))]
            hundred: splat32(100),
            #[cfg(not(target_feature = "sse4.1"))]
            moddiv10: splat16(10 * (1 << 8) - 1),
            zeros: splat64(ZEROS),
        };

        let div10k = unsafe { _mm_load_si128(&CONSTS.div10k) };
        let neg10k = unsafe { _mm_load_si128(&CONSTS.neg10k) };
        let div100 = unsafe { _mm_load_si128(&CONSTS.div100) };
        let div10 = unsafe { _mm_load_si128(&CONSTS.div10) };
        #[cfg(target_feature = "sse4.1")]
        let neg100 = unsafe { _mm_load_si128(&CONSTS.neg100) };
        #[cfg(target_feature = "sse4.1")]
        let neg10 = unsafe { _mm_load_si128(&CONSTS.neg10) };
        #[cfg(target_feature = "sse4.1")]
        let bswap = unsafe { _mm_load_si128(&CONSTS.bswap) };
        #[cfg(not(target_feature = "sse4.1"))]
        let hundred = unsafe { _mm_load_si128(&CONSTS.hundred) };
        #[cfg(not(target_feature = "sse4.1"))]
        let moddiv10 = unsafe { _mm_load_si128(&CONSTS.moddiv10) };
        let zeros = unsafe { _mm_load_si128(&CONSTS.zeros) };

        // The BCD sequences are based on ones provided by Xiang JunBo.
        unsafe {
            // Both 8-digit groups in one register; split base 1e4 first.
            let x: __m128i = _mm_set_epi64x(i64::from(abcdefgh), i64::from(ijklmnop));
            let y: __m128i = _mm_add_epi64(
                x,
                _mm_mul_epu32(neg10k, _mm_srli_epi64(_mm_mul_epu32(x, div10k), DIV10K_EXP)),
            );

            #[cfg(target_feature = "sse4.1")]
            let bcd: __m128i = {
                // _mm_mullo_epi32 is SSE 4.1
                let z: __m128i = _mm_add_epi64(
                    y,
                    _mm_mullo_epi32(neg100, _mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)),
                );
                let big_endian_bcd: __m128i =
                    _mm_add_epi64(z, _mm_mullo_epi16(neg10, _mm_mulhi_epu16(z, div10)));
                // SSSE3
                _mm_shuffle_epi8(big_endian_bcd, bswap)
            };

            #[cfg(not(target_feature = "sse4.1"))]
            let bcd: __m128i = {
                let y_div_100: __m128i = _mm_srli_epi16(_mm_mulhi_epu16(y, div100), 3);
                let y_mod_100: __m128i = _mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, hundred));
                let z: __m128i = _mm_or_si128(_mm_slli_epi32(y_mod_100, 16), y_div_100);
                let bcd_shuffled: __m128i = _mm_sub_epi16(
                    _mm_slli_epi16(z, 8),
                    _mm_mullo_epi16(moddiv10, _mm_mulhi_epu16(z, div10)),
                );
                _mm_shuffle_epi32(bcd_shuffled, _MM_SHUFFLE(0, 1, 2, 3))
            };

            // Convert BCD digits to ASCII by OR-ing '0' into every byte.
            let digits = _mm_or_si128(bcd, zeros);

            // determine number of leading zeros
            let mask128: __m128i = _mm_cmpgt_epi8(bcd, _mm_setzero_si128());
            let mask = _mm_movemask_epi8(mask128) as u32;
            // We don't need a zero-check here: if the mask were zero, either
            // the significand is zero which is handled elsewhere or the only
            // non-zero digit is the last digit which we factored off. But in
            // that case the number would be printed with a different exponent
            // that shifts the last digit into the first position.
            let len = 32 - mask.leading_zeros() as usize;

            _mm_storeu_si128(buffer.cast::<__m128i>(), digits);
            buffer.add(if last_digit != 0 { 17 } else { len })
        }
    }
}
775
776// Writes a significand consisting of up to 9 decimal digits (8-9 for normals)
777// and removes trailing zeros.
778#[cfg_attr(feature = "no-panic", no_panic)]
779unsafe fn write_significand9(mut buffer: *mut u8, value: u32, has9digits: bool) -> *mut u8 {
780    buffer = unsafe { write_if(buffer, value / 100_000_000, has9digits) };
781    let bcd = to_bcd8(u64::from(value % 100_000_000));
782    unsafe {
783        write8(buffer, bcd | ZEROS);
784        buffer.add(count_trailing_nonzeros(bcd))
785    }
786}
787
// A decimal significand/exponent pair produced by the conversion; on SSE2
// targets the precomputed sig / 10 is carried along for write_significand17.
struct ToDecimalResult {
    sig: i64,
    exp: i32,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
    sig_div10: i64,
}
794
795fn normalize<UInt>(mut dec: ToDecimalResult, subnormal: bool) -> ToDecimalResult
796where
797    UInt: traits::UInt,
798{
799    if !subnormal {
800        return dec;
801    }
802    let num_bits = mem::size_of::<UInt>() * 8;
803    while dec.sig
804        < if num_bits == 64 {
805            10_000_000_000_000_000
806        } else {
807            100_000_000
808        }
809    {
810        dec.sig *= 10;
811        dec.exp -= 1;
812    }
813    #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
814    {
815        dec.sig_div10 = dec.sig / 10;
816    }
817    dec
818}
819
// Converts bin_sig * 2**bin_exp to the shortest decimal representation using
// the Schubfach algorithm. Used as the correctness fallback (and for
// subnormals) when the fast path in to_decimal_normal bails out on a
// boundary case.
#[cfg_attr(feature = "no-panic", no_panic)]
fn to_decimal_schubfach<const SUBNORMAL: bool, UInt>(
    bin_sig: UInt,
    bin_exp: i64,
    regular: bool,
) -> ToDecimalResult
where
    UInt: traits::UInt,
{
    let num_bits = mem::size_of::<UInt>() as i32 * 8;
    let dec_exp = compute_dec_exp(bin_exp as i32, regular);
    // SAFETY-relevant: dec_exp is within the table's index range by
    // construction of compute_dec_exp — TODO confirm against table bounds.
    let exp_shift = unsafe { compute_exp_shift::<UInt, false>(bin_exp as i32, dec_exp) };
    let mut pow10 = unsafe { POW10_SIGNIFICANDS.get_unchecked(-dec_exp) };

    // Fallback to Schubfach to guarantee correctness in boundary cases. This
    // requires switching to strict overestimates of powers of 10.
    if num_bits == 64 {
        pow10.lo += 1;
    } else {
        pow10.hi += 1;
    }

    // Shift the significand so that boundaries are integer.
    const BOUND_SHIFT: u32 = 2;
    let bin_sig_shifted = bin_sig << BOUND_SHIFT;

    // Compute the estimates of lower and upper bounds of the rounding interval
    // by multiplying them by the power of 10 and applying modified rounding.
    // The lsb adjustment implements round-to-even at the interval endpoints.
    let lsb = bin_sig & UInt::from(1);
    // The lower neighbor gap is halved for powers of two (regular == false at
    // the binade boundary), hence the (regular + 1) offset.
    let lower = (bin_sig_shifted - (UInt::from(regular) + UInt::from(1))) << exp_shift;
    let lower = umulhi_inexact_to_odd(pow10.hi, pow10.lo, lower) + lsb;
    let upper = (bin_sig_shifted + UInt::from(2)) << exp_shift;
    let upper = umulhi_inexact_to_odd(pow10.hi, pow10.lo, upper) - lsb;

    // The idea of using a single shorter candidate is by Cassio Neri.
    // It is less or equal to the upper bound by construction.
    let div10 = (upper >> BOUND_SHIFT) / UInt::from(10);
    let shorter = div10 * UInt::from(10);
    if (shorter << BOUND_SHIFT) >= lower {
        // The shorter (trailing-zero) candidate lies in the rounding interval.
        let result = ToDecimalResult {
            sig: shorter.into() as i64,
            exp: dec_exp,
            #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
            sig_div10: div10.into() as i64,
        };
        return normalize::<UInt>(result, SUBNORMAL);
    }

    // No shorter candidate: round the scaled significand to the nearest of
    // the two adjacent longer candidates.
    let scaled_sig = umulhi_inexact_to_odd(pow10.hi, pow10.lo, bin_sig_shifted << exp_shift);
    let longer_below = scaled_sig >> BOUND_SHIFT;
    let longer_above = longer_below + UInt::from(1);

    // Pick the closest of longer_below and longer_above and check if it's in
    // the rounding interval.
    let cmp = scaled_sig
        .wrapping_sub((longer_below + longer_above) << 1)
        .to_signed();
    // Ties (cmp == 0) go to the even candidate.
    let below_closer = cmp < UInt::from(0).to_signed()
        || (cmp == UInt::from(0).to_signed() && (longer_below & UInt::from(1)) == UInt::from(0));
    let below_in = (longer_below << BOUND_SHIFT) >= lower;
    let dec_sig = if below_closer & below_in {
        longer_below
    } else {
        longer_above
    };
    let result = ToDecimalResult {
        sig: dec_sig.into() as i64,
        exp: dec_exp,
        #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
        sig_div10: (dec_sig / UInt::from(10)).into() as i64,
    };
    normalize::<UInt>(result, SUBNORMAL)
}
893
894// Here be 🐉s.
895// Converts a binary FP number bin_sig * 2**bin_exp to the shortest decimal
896// representation, where bin_exp = raw_exp - exp_offset.
897#[cfg_attr(feature = "no-panic", no_panic)]
898fn to_decimal_normal<Float, UInt>(bin_sig: UInt, raw_exp: i64, regular: bool) -> ToDecimalResult
899where
900    Float: FloatTraits,
901    UInt: traits::UInt,
902{
903    let bin_exp = raw_exp - i64::from(Float::EXP_OFFSET);
904    let num_bits = mem::size_of::<UInt>() as i32 * 8;
905    // An optimization from yy by Yaoyuan Guo:
906    while regular {
907        let dec_exp = if USE_UMUL128_HI64 {
908            umul128_hi64(bin_exp as u64, 0x4d10500000000000) as i32
909        } else {
910            compute_dec_exp(bin_exp as i32, true)
911        };
912        let exp_shift = unsafe { compute_exp_shift::<UInt, true>(bin_exp as i32, dec_exp) };
913        let pow10 = unsafe { POW10_SIGNIFICANDS.get_unchecked(-dec_exp) };
914
915        let integral; // integral part of bin_sig * pow10
916        let fractional; // fractional part of bin_sig * pow10
917        if num_bits == 64 {
918            let p = umul192_hi128(pow10.hi, pow10.lo, (bin_sig << exp_shift).into());
919            integral = UInt::truncate(p.hi);
920            fractional = p.lo;
921        } else {
922            let p = umul128(pow10.hi, (bin_sig << exp_shift).into());
923            integral = UInt::truncate((p >> 64) as u64);
924            fractional = p as u64;
925        }
926        const HALF_ULP: u64 = 1 << 63;
927
928        // Exact half-ulp tie when rounding to nearest integer.
929        let cmp = fractional.wrapping_sub(HALF_ULP) as i64;
930        if cmp == 0 {
931            break;
932        }
933
934        // An optimization of integral % 10 by Dougall Johnson. Relies on range
935        // calculation: (max_bin_sig << max_exp_shift) * max_u128.
936        // (1 << 63) / 5 == (1 << 64) / 10 without an intermediate int128.
937        const DIV10_SIG64: u64 = (1 << 63) / 5 + 1;
938        let div10 = umul128_hi64(integral.into(), DIV10_SIG64);
939        #[allow(unused_mut)]
940        let mut digit = integral.into() - div10 * 10;
941        // or it narrows to 32-bit and doesn't use madd/msub
942        #[cfg(all(any(target_arch = "aarch64", target_arch = "x86_64"), not(miri)))]
943        unsafe {
944            asm!("/*{0}*/", inout(reg) digit);
945        }
946
947        // Switch to a fixed-point representation with the least significant
948        // integral digit in the upper bits and fractional digits in the lower
949        // bits.
950        let num_integral_bits = if num_bits == 64 { 4 } else { 32 };
951        let num_fractional_bits = 64 - num_integral_bits;
952        let ten = 10u64 << num_fractional_bits;
953        // Fixed-point remainder of the scaled significand modulo 10.
954        let scaled_sig_mod10 = (digit << num_fractional_bits) | (fractional >> num_integral_bits);
955
956        // scaled_half_ulp = 0.5 * pow10 in the fixed-point format.
957        // dec_exp is chosen so that 10**dec_exp <= 2**bin_exp < 10**(dec_exp + 1).
958        // Since 1ulp == 2**bin_exp it will be in the range [1, 10) after scaling
959        // by 10**dec_exp. Add 1 to combine the shift with division by two.
960        let scaled_half_ulp = pow10.hi >> (num_integral_bits - exp_shift + 1);
961        let upper = scaled_sig_mod10 + scaled_half_ulp;
962
963        // value = 5.0507837461e-27
964        // next  = 5.0507837461000010e-27
965        //
966        // c = integral.fractional' = 50507837461000003.153987... (value)
967        //                            50507837461000010.328635... (next)
968        //          scaled_half_ulp =                 3.587324...
969        //
970        // fractional' = fractional / 2**64, fractional = 2840565642863009226
971        //
972        //      50507837461000000       c               upper     50507837461000010
973        //              s              l|   L             |               S
974        // ───┬────┬────┼────┬────┬────┼*-──┼────┬────┬───*┬────┬────┬────┼-*--┬───
975        //    8    9    0    1    2    3    4    5    6    7    8    9    0 |  1
976        //            └─────────────────┼─────────────────┘                next
977        //                             1ulp
978        //
979        // s - shorter underestimate, S - shorter overestimate
980        // l - longer underestimate,  L - longer overestimate
981
982        // Check for boundary case when rounding down to nearest 10 and
983        // near-boundary case when rounding up to nearest 10.
984        if scaled_sig_mod10 == scaled_half_ulp
985            // Case where upper == ten is insufficient: 1.342178e+08f.
986            // upper == ten || upper == ten - 1
987            || ten.wrapping_sub(upper) <= 1
988        {
989            break;
990        }
991
992        let round_up = upper >= ten;
993        let mut shorter = (integral.into() - digit) as i64;
994        let longer = (integral.into() + u64::from(cmp >= 0)) as i64;
995        if falsecfg!(target_arch = "aarch64") {
996            // Faster version without ccmp.
997            let dec_sig =
998                hint::select_unpredictable(scaled_sig_mod10 < scaled_half_ulp, shorter, longer);
999            return ToDecimalResult {
1000                sig: hint::select_unpredictable(round_up, shorter + 10, dec_sig),
1001                exp: dec_exp,
1002                #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
1003                sig_div10: 0,
1004            };
1005        }
1006        shorter += i64::from(round_up) * 10;
1007        let use_shorter = scaled_sig_mod10 <= scaled_half_ulp || round_up;
1008        return ToDecimalResult {
1009            sig: hint::select_unpredictable(use_shorter, shorter, longer),
1010            exp: dec_exp,
1011            #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
1012            sig_div10: div10 as i64 + i64::from(use_shorter) * i64::from(round_up),
1013        };
1014    }
1015    to_decimal_schubfach::<false, UInt>(bin_sig, bin_exp, regular)
1016}
1017
/// Writes the shortest correctly rounded decimal representation of `value` to
/// `buffer`. `buffer` should point to a buffer of size `buffer_size` or larger.
///
/// Returns a pointer one past the last byte written. Handles zero and
/// subnormals explicitly; infinities/NaN are the caller's responsibility
/// (format_nonfinite).
#[cfg_attr(feature = "no-panic", no_panic)]
unsafe fn write<Float>(value: Float, mut buffer: *mut u8) -> *mut u8
where
    Float: FloatTraits,
{
    let bits = value.to_bits();
    // It is beneficial to extract exponent and significand early.
    let bin_exp = Float::get_exp(bits); // binary exponent
    let bin_sig = Float::get_sig(bits); // binary significand

    // Branchless sign handling: always store '-', advance only if negative.
    unsafe {
        *buffer = b'-';
    }
    buffer = unsafe { buffer.add(usize::from(Float::is_negative(bits))) };

    let mut dec = if bin_exp == 0 {
        // Zero or subnormal.
        if bin_sig == Float::SigType::from(0) {
            // Exact zero prints as "0.0" (sign already handled above).
            return unsafe {
                *buffer = b'0';
                *buffer.add(1) = b'.';
                *buffer.add(2) = b'0';
                buffer.add(3)
            };
        }
        // Subnormal: no implicit bit; exponent is 1 - EXP_OFFSET.
        to_decimal_schubfach::<true, Float::SigType>(
            bin_sig,
            i64::from(1 - Float::EXP_OFFSET),
            true,
        )
    } else {
        // Normal: add the implicit leading bit. `regular` is false exactly at
        // binade boundaries (stored significand == 0), where the lower
        // rounding gap is halved.
        to_decimal_normal::<Float, Float::SigType>(
            bin_sig | Float::IMPLICIT_BIT,
            bin_exp,
            bin_sig != Float::SigType::from(0),
        )
    };
    let mut dec_exp = dec.exp;

    // Write significand.
    let end = if Float::NUM_BITS == 64 {
        let has17digits = dec.sig >= 10_000_000_000_000_000;
        // Convert to scientific-notation exponent (decimal point after the
        // first digit).
        dec_exp += Float::MAX_DIGITS10 as i32 - 2 + i32::from(has17digits);
        unsafe {
            write_significand17(
                buffer,
                dec.sig as u64,
                has17digits,
                #[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(miri)))]
                dec.sig_div10,
            )
        }
    } else {
        // f32: pad 7-digit significands to 8 so the writer sees 8-9 digits.
        if dec.sig < 10_000_000 {
            dec.sig *= 10;
            dec_exp -= 1;
        }
        let has9digits = dec.sig >= 100_000_000;
        dec_exp += Float::MAX_DIGITS10 as i32 - 2 + i32::from(has9digits);
        unsafe { write_significand9(buffer.add(1), dec.sig as u32, has9digits) }
    };

    // Digits were written starting at buffer+1, leaving buffer[0] free for
    // fixed-notation shuffling below.
    let length = unsafe { end.offset_from(buffer.add(1)) } as usize;

    // Fixed notation for exponents in a readable range, matching the
    // shortest-output conventions; otherwise scientific notation.
    if Float::NUM_BITS == 32 && (-6..=12).contains(&dec_exp)
        || Float::NUM_BITS == 64 && (-5..=15).contains(&dec_exp)
    {
        if length as i32 - 1 <= dec_exp {
            // 1234e7 -> 12340000000.0
            return unsafe {
                ptr::copy(buffer.add(1), buffer, length);
                ptr::write_bytes(buffer.add(length), b'0', dec_exp as usize + 3 - length);
                *buffer.add(dec_exp as usize + 1) = b'.';
                buffer.add(dec_exp as usize + 3)
            };
        } else if 0 <= dec_exp {
            // 1234e-2 -> 12.34
            return unsafe {
                ptr::copy(buffer.add(1), buffer, dec_exp as usize + 1);
                *buffer.add(dec_exp as usize + 1) = b'.';
                buffer.add(length + 1)
            };
        } else {
            // 1234e-6 -> 0.001234
            return unsafe {
                ptr::copy(buffer.add(1), buffer.add((1 - dec_exp) as usize), length);
                ptr::write_bytes(buffer, b'0', (1 - dec_exp) as usize);
                *buffer.add(1) = b'.';
                buffer.add((1 - dec_exp) as usize + length)
            };
        }
    }

    unsafe {
        // 1234e30 -> 1.234e33
        *buffer = *buffer.add(1);
        *buffer.add(1) = b'.';
    }
    // Single-digit significands print without a '.': skip the extra slot.
    buffer = unsafe { buffer.add(length + usize::from(length > 1)) };

    // Write exponent.
    // 'e' and its sign are stored together as one little-endian u16 at the
    // end, after the digit count is known.
    let sign_ptr = buffer;
    let e_sign = if dec_exp >= 0 {
        (u16::from(b'+') << 8) | u16::from(b'e')
    } else {
        (u16::from(b'-') << 8) | u16::from(b'e')
    };
    buffer = unsafe { buffer.add(1) };
    dec_exp = if dec_exp >= 0 { dec_exp } else { -dec_exp };
    buffer = unsafe { buffer.add(usize::from(dec_exp >= 10)) };
    if Float::MIN_10_EXP >= -99 && Float::MAX_10_EXP <= 99 {
        // f32: at most 2 exponent digits; write both via the digit-pair table.
        unsafe {
            buffer
                .cast::<u16>()
                .write_unaligned(*digits2(dec_exp as usize));
            sign_ptr.cast::<u16>().write_unaligned(e_sign.to_le());
            return buffer.add(2);
        }
    }

    // f64: up to 3 exponent digits.
    // digit = dec_exp / 100
    let digit = if USE_UMUL128_HI64 {
        umul128_hi64(dec_exp as u64, 0x290000000000000) as u32
    } else {
        (dec_exp as u32 * DIV100_SIG) >> DIV100_EXP
    };
    unsafe {
        *buffer = b'0' + digit as u8;
    }
    // The hundreds digit is kept only when dec_exp >= 100; otherwise the
    // digit pair below overwrites it.
    buffer = unsafe { buffer.add(usize::from(dec_exp >= 100)) };
    unsafe {
        buffer
            .cast::<u16>()
            .write_unaligned(*digits2((dec_exp as u32 - digit * 100) as usize));
        sign_ptr.cast::<u16>().write_unaligned(e_sign.to_le());
        buffer.add(2)
    }
}
1157
/// Safe API for formatting floating point numbers to text.
///
/// ## Example
///
/// ```
/// let mut buffer = zmij::Buffer::new();
/// let printed = buffer.format_finite(1.234);
/// assert_eq!(printed, "1.234");
/// ```
pub struct Buffer {
    // Uninitialized scratch space; format/format_finite write into it and
    // return a &str borrowing the written prefix.
    bytes: [MaybeUninit<u8>; BUFFER_SIZE],
}
1170
1171impl Buffer {
1172    /// This is a cheap operation; you don't need to worry about reusing buffers
1173    /// for efficiency.
1174    #[inline]
1175    #[cfg_attr(feature = "no-panic", no_panic)]
1176    pub fn new() -> Self {
1177        let bytes = [MaybeUninit::<u8>::uninit(); BUFFER_SIZE];
1178        Buffer { bytes }
1179    }
1180
1181    /// Print a floating point number into this buffer and return a reference to
1182    /// its string representation within the buffer.
1183    ///
1184    /// # Special cases
1185    ///
1186    /// This function formats NaN as the string "NaN", positive infinity as
1187    /// "inf", and negative infinity as "-inf" to match std::fmt.
1188    ///
1189    /// If your input is known to be finite, you may get better performance by
1190    /// calling the `format_finite` method instead of `format` to avoid the
1191    /// checks for special cases.
1192    #[cfg_attr(feature = "no-panic", no_panic)]
1193    pub fn format<F: Float>(&mut self, f: F) -> &str {
1194        if f.is_nonfinite() {
1195            f.format_nonfinite()
1196        } else {
1197            self.format_finite(f)
1198        }
1199    }
1200
1201    /// Print a floating point number into this buffer and return a reference to
1202    /// its string representation within the buffer.
1203    ///
1204    /// # Special cases
1205    ///
1206    /// This function **does not** check for NaN or infinity. If the input
1207    /// number is not a finite float, the printed representation will be some
1208    /// correctly formatted but unspecified numerical value.
1209    ///
1210    /// Please check [`is_finite`] yourself before calling this function, or
1211    /// check [`is_nan`] and [`is_infinite`] and handle those cases yourself.
1212    ///
1213    /// [`is_finite`]: f64::is_finite
1214    /// [`is_nan`]: f64::is_nan
1215    /// [`is_infinite`]: f64::is_infinite
1216    #[cfg_attr(feature = "no-panic", no_panic)]
1217    pub fn format_finite<F: Float>(&mut self, f: F) -> &str {
1218        unsafe {
1219            let end = f.write_to_zmij_buffer(self.bytes.as_mut_ptr().cast::<u8>());
1220            let len = end.offset_from(self.bytes.as_ptr().cast::<u8>()) as usize;
1221            let slice = slice::from_raw_parts(self.bytes.as_ptr().cast::<u8>(), len);
1222            str::from_utf8_unchecked(slice)
1223        }
1224    }
1225}
1226
/// A floating point number, f32 or f64, that can be written into a
/// [`zmij::Buffer`][Buffer].
///
/// This trait is sealed and cannot be implemented for types outside of the
/// `zmij` crate.
#[allow(unknown_lints)] // rustc older than 1.74
#[allow(private_bounds)]
pub trait Float: private::Sealed {}
// The only two implementors; the sealed supertrait lives in `mod private`.
impl Float for f32 {}
impl Float for f64 {}
1237
1238mod private {
1239    pub trait Sealed: crate::traits::Float {
1240        fn is_nonfinite(self) -> bool;
1241        fn format_nonfinite(self) -> &'static str;
1242        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8;
1243    }
1244
1245    impl Sealed for f32 {
1246        #[inline]
1247        fn is_nonfinite(self) -> bool {
1248            const EXP_MASK: u32 = 0x7f800000;
1249            let bits = self.to_bits();
1250            bits & EXP_MASK == EXP_MASK
1251        }
1252
1253        #[cold]
1254        #[cfg_attr(feature = "no-panic", inline)]
1255        fn format_nonfinite(self) -> &'static str {
1256            const MANTISSA_MASK: u32 = 0x007fffff;
1257            const SIGN_MASK: u32 = 0x80000000;
1258            let bits = self.to_bits();
1259            if bits & MANTISSA_MASK != 0 {
1260                crate::NAN
1261            } else if bits & SIGN_MASK != 0 {
1262                crate::NEG_INFINITY
1263            } else {
1264                crate::INFINITY
1265            }
1266        }
1267
1268        #[cfg_attr(feature = "no-panic", inline)]
1269        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8 {
1270            unsafe { crate::write(self, buffer) }
1271        }
1272    }
1273
1274    impl Sealed for f64 {
1275        #[inline]
1276        fn is_nonfinite(self) -> bool {
1277            const EXP_MASK: u64 = 0x7ff0000000000000;
1278            let bits = self.to_bits();
1279            bits & EXP_MASK == EXP_MASK
1280        }
1281
1282        #[cold]
1283        #[cfg_attr(feature = "no-panic", inline)]
1284        fn format_nonfinite(self) -> &'static str {
1285            const MANTISSA_MASK: u64 = 0x000fffffffffffff;
1286            const SIGN_MASK: u64 = 0x8000000000000000;
1287            let bits = self.to_bits();
1288            if bits & MANTISSA_MASK != 0 {
1289                crate::NAN
1290            } else if bits & SIGN_MASK != 0 {
1291                crate::NEG_INFINITY
1292            } else {
1293                crate::INFINITY
1294            }
1295        }
1296
1297        #[cfg_attr(feature = "no-panic", inline)]
1298        unsafe fn write_to_zmij_buffer(self, buffer: *mut u8) -> *mut u8 {
1299            unsafe { crate::write(self, buffer) }
1300        }
1301    }
1302}
1303
1304impl Default for Buffer {
1305    #[inline]
1306    #[cfg_attr(feature = "no-panic", no_panic)]
1307    fn default() -> Self {
1308        Buffer::new()
1309    }
1310}