Skip to main content

libm/math/arch/x86/
detect.rs

1// Using runtime feature detection requires atomics. Currently there are no x86 targets
2// that support sse but not `AtomicPtr`.
3
4#[cfg(target_arch = "x86")]
5use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
6#[cfg(target_arch = "x86_64")]
7use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
8
9use crate::support::feature_detect::{Flags, get_or_init_flags_cache, unique_masks};
10
11/// CPU features that get cached (doesn't correlate to anything on the CPU).
12pub mod cpu_flags {
13    use super::unique_masks;
14
15    pub const AVX512BF16: u32 = 1 << 0 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1;
const _: () =
    if !(AVX512BF16 != (1 << (<u32>::BITS - 1))) {
        ::core::panicking::panic("assertion failed: AVX512BF16 != (1 << (<u32>::BITS - 1))")
    };unique_masks! {
16        u32,
17        SSE3,
18        F16C,
19        SSE,
20        SSE2,
21        ERMSB,
22        MOVRS,
23        FMA,
24        FMA4,
25        AVX512FP16,
26        AVX512BF16,
27    }
28}
29
30/// Get CPU features, loading from a cache if available.
31pub fn get_cpu_features() -> Flags {
32    use core::sync::atomic::AtomicU32;
33    static CACHE: AtomicU32 = AtomicU32::new(0);
34    get_or_init_flags_cache(&CACHE, load_x86_features)
35}
36
37/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
38///
39/// Implementation is taken from [std-detect][std-detect].
40///
41/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
42// FIXME(msrv): Remove unsafe block around __cpuid once https://github.com/rust-lang/stdarch/pull/1935 is available in MSRV.
43#[allow(unused_unsafe)]
44fn load_x86_features() -> Flags {
45    let mut value = Flags::empty();
46
47    if falsecfg!(target_env = "sgx") {
48        // doesn't support this because it is untrusted data
49        return Flags::empty();
50    }
51
52    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
53    // has `cpuid` support.
54
55    // 0. EAX = 0: Basic Information:
56    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
57    //   value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000].
58    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
59    //   (in that order)
60    let mut vendor_id = [0u8; 12];
61    let max_basic_leaf;
62    unsafe {
63        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
64        max_basic_leaf = eax;
65        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
66        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
67        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
68    }
69
70    if max_basic_leaf < 1 {
71        // Earlier Intel 486, CPUID not implemented
72        return value;
73    }
74
75    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
76    // Contains information about most x86 features.
77    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
78    let proc_info_ecx = Flags::from_bits(ecx);
79    let proc_info_edx = Flags::from_bits(edx);
80
81    // EAX = 7: Queries "Extended Features";
82    // Contains information about bmi,bmi2, and avx2 support.
83    let mut extended_features_ebx = Flags::empty();
84    let mut extended_features_edx = Flags::empty();
85    let mut extended_features_eax_leaf_1 = Flags::empty();
86    if max_basic_leaf >= 7 {
87        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
88        extended_features_ebx = Flags::from_bits(ebx);
89        extended_features_edx = Flags::from_bits(edx);
90
91        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
92        extended_features_eax_leaf_1 = Flags::from_bits(eax)
93    }
94
95    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
96    // - EAX returns the max leaf value for extended information, that is,
97    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
98    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;
99
100    // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits"
101    let mut extended_proc_info_ecx = Flags::empty();
102    if extended_max_basic_leaf >= 1 {
103        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
104        extended_proc_info_ecx = Flags::from_bits(ecx);
105    }
106
107    let mut enable = |regflags: Flags, regbit, flag| {
108        if regflags.test_nth(regbit) {
109            value.insert(flag);
110        }
111    };
112
113    enable(proc_info_ecx, 0, cpu_flags::SSE3);
114    enable(proc_info_ecx, 29, cpu_flags::F16C);
115    enable(proc_info_edx, 25, cpu_flags::SSE);
116    enable(proc_info_edx, 26, cpu_flags::SSE2);
117    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
118    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);
119
120    // `XSAVE` and `AVX` support:
121    let cpu_xsave = proc_info_ecx.test_nth(26);
122    if cpu_xsave {
123        // 0. Here the CPU supports `XSAVE`.
124
125        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
126        //    supports saving the state of the AVX/AVX2 vector registers on
127        //    context-switches, see:
128        //
129        // - [intel: is avx enabled?][is_avx_enabled],
130        // - [mozilla: sse.cpp][mozilla_sse_cpp].
131        //
132        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
133        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
134        let cpu_osxsave = proc_info_ecx.test_nth(27);
135
136        if cpu_osxsave {
137            // 2. The OS must have signaled the CPU that it supports saving and
138            // restoring the:
139            //
140            // * SSE -> `XCR0.SSE[1]`
141            // * AVX -> `XCR0.AVX[2]`
142            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
143            // * AMX -> `XCR0.AMX[18:17]`
144            //
145            // by setting the corresponding bits of `XCR0` to `1`.
146            //
147            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
148            let xcr0 = unsafe { _xgetbv(0) };
149            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
150            let os_avx_support = xcr0 & 6 == 6;
151            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
152            let os_avx512_support = xcr0 & 0xe0 == 0xe0;
153
154            // Only if the OS and the CPU support saving/restoring the AVX
155            // registers we enable `xsave` support:
156            if os_avx_support {
157                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
158                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
159                // Developer’s Manual, Volume 1: Basic Architecture":
160                //
161                // "Software enables the XSAVE feature set by setting
162                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
163                // instruction). If this bit is 0, execution of any of XGETBV,
164                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
165                // causes an invalid-opcode exception (#UD)"
166
167                // FMA (uses 256-bit wide registers):
168                enable(proc_info_ecx, 12, cpu_flags::FMA);
169
170                // For AVX-512 the OS also needs to support saving/restoring
171                // the extended state, only then we enable AVX-512 support:
172                if os_avx512_support {
173                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
174                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
175                }
176            }
177        }
178    }
179
180    // As Hygon Dhyana originates from AMD technology and shares most of the architecture with
181    // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number
182    // (Family 18h).
183    //
184    // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
185    // family 17h.
186    //
187    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
188    // (AMD64 Architecture Programmer's Manual, Appendix E).
189    // Related Hygon kernel patch can be found on
190    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
191    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
192        // These features are available on AMD arch CPUs:
193        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
194    }
195
196    value
197}
198
#[cfg(test)]
mod tests {
    extern crate std;
    use std::is_x86_feature_detected;

    use super::*;

    /// Cross-check every entry of `cpu_flags::ALL` against `std`'s runtime feature detection.
    ///
    /// Flags that `std` cannot be asked about are skipped; a flag missing from the table below
    /// makes the test panic so that new flags cannot go untested silently.
    #[test]
    fn check_matches_std() {
        let features = get_cpu_features();

        // Walk both tables by index so that a length mismatch panics instead of being truncated.
        let flag_count = cpu_flags::ALL.len();
        let entries = (0..flag_count).map(|idx| (cpu_flags::ALL[idx], cpu_flags::NAMES[idx]));

        for (flag, name) in entries {
            // `None` marks a flag that has no usable counterpart in std.
            let expected: Option<bool> = match flag {
                cpu_flags::SSE3 => Some(is_x86_feature_detected!("sse3")),
                cpu_flags::F16C => Some(is_x86_feature_detected!("f16c")),
                cpu_flags::SSE => Some(is_x86_feature_detected!("sse")),
                cpu_flags::SSE2 => Some(is_x86_feature_detected!("sse2")),
                cpu_flags::ERMSB => Some(is_x86_feature_detected!("ermsb")),
                // only very recent support in std
                cpu_flags::MOVRS => None,
                cpu_flags::FMA => Some(is_x86_feature_detected!("fma")),
                // not yet supported in std
                cpu_flags::FMA4 => None,
                cpu_flags::AVX512FP16 => Some(is_x86_feature_detected!("avx512fp16")),
                cpu_flags::AVX512BF16 => Some(is_x86_feature_detected!("avx512bf16")),
                _ => panic!("untested CPU flag {name}"),
            };

            if let Some(std_detected) = expected {
                assert_eq!(
                    std_detected,
                    features.contains(flag),
                    "different flag {name}. flags: {features:?}"
                );
            }
        }
    }
}