diff --git a/Cargo.toml b/Cargo.toml index 8c51848af8..8cb48b8b49 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,6 @@ repository = "https://github.com/briansmith/ring" # Keep in sync with .github/workflows/ci.yml ("MSRV") and see the MSRV note # in cpu/arm.rs. -# 1.66 is required on x86/x86_64 for https://github.com/rust-lang/rust/pull/101861. rust-version = "1.66.0" # Keep in sync with `links` below. diff --git a/build.rs b/build.rs index 9843ad8aa5..e78c505763 100644 --- a/build.rs +++ b/build.rs @@ -66,8 +66,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[ARM, X86_64, X86], "crypto/crypto.c"), - (&[X86_64, X86], "crypto/cpu_intel.c"), - (&[X86], "crypto/fipsmodule/aes/asm/aesni-x86.pl"), (&[X86], "crypto/fipsmodule/aes/asm/ghash-x86.pl"), (&[X86], "crypto/fipsmodule/aes/asm/vpaes-x86.pl"), @@ -886,7 +884,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "LIMBS_window5_split_window", "LIMBS_window5_unsplit_window", "LIMB_shr", - "OPENSSL_cpuid_setup", "aes_gcm_dec_kernel", "aes_gcm_dec_update_vaes_avx2", "aes_gcm_enc_kernel", diff --git a/crypto/cpu_intel.c b/crypto/cpu_intel.c deleted file mode 100644 index 6e792b6ba4..0000000000 --- a/crypto/cpu_intel.c +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - - -#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) - -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(push, 3) -#include -#include -#pragma warning(pop) -#endif - -#include "internal.h" - - -// OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX -// is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through -// |*out_edx|. -static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, - uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { -#if defined(_MSC_VER) && !defined(__clang__) - int tmp[4]; - __cpuid(tmp, (int)leaf); - *out_eax = (uint32_t)tmp[0]; - *out_ebx = (uint32_t)tmp[1]; - *out_ecx = (uint32_t)tmp[2]; - *out_edx = (uint32_t)tmp[3]; -#elif defined(__pic__) && defined(OPENSSL_32_BIT) - // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. - // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. - __asm__ volatile ( - "xor %%ecx, %%ecx\n" - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" - : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) - : "a"(leaf) - ); -#else - __asm__ volatile ( - "xor %%ecx, %%ecx\n" - "cpuid\n" - : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) - : "a"(leaf) - ); -#endif -} - -// OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). -// Currently only XCR0 is defined by Intel so |xcr| should always be zero. -// -// See https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family -static uint64_t OPENSSL_xgetbv(uint32_t xcr) { -#if defined(_MSC_VER) && !defined(__clang__) - return (uint64_t)_xgetbv(xcr); -#else - uint32_t eax, edx; - __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (((uint64_t)edx) << 32) | eax; -#endif -} - -void OPENSSL_cpuid_setup(uint32_t OPENSSL_ia32cap_P[4]) { - // Determine the vendor and maximum input value. 
- uint32_t eax, ebx, ecx, edx; - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); - - uint32_t num_ids = eax; - - int is_intel = ebx == 0x756e6547 /* Genu */ && - edx == 0x49656e69 /* ineI */ && - ecx == 0x6c65746e /* ntel */; - - uint32_t extended_features[2] = {0}; - if (num_ids >= 7) { - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); - extended_features[0] = ebx; - extended_features[1] = ecx; - } - - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); - - const uint32_t base_family = (eax >> 8) & 15; - const uint32_t base_model = (eax >> 4) & 15; - - uint32_t family = base_family; - uint32_t model = base_model; - if (base_family == 15) { - const uint32_t ext_family = (eax >> 20) & 255; - family += ext_family; - } - if (base_family == 6 || base_family == 15) { - const uint32_t ext_model = (eax >> 16) & 15; - model |= ext_model << 4; - } - - // Reserved bit #30 is repurposed to signal an Intel CPU. - if (is_intel) { - edx |= (1u << 30); - } else { - edx &= ~(1u << 30); - } - - uint64_t xcr0 = 0; - if (ecx & (1u << 27)) { - // XCR0 may only be queried if the OSXSAVE bit is set. - xcr0 = OPENSSL_xgetbv(0); - } - // See Intel manual, volume 1, section 14.3. - if ((xcr0 & 6) != 6) { - // YMM registers cannot be used. - ecx &= ~(1u << 28); // AVX - ecx &= ~(1u << 12); // FMA - ecx &= ~(1u << 11); // AMD XOP - extended_features[0] &= ~(1u << 5); // AVX2 - extended_features[1] &= ~(1u << 9); // VAES - extended_features[1] &= ~(1u << 10); // VPCLMULQDQ - } - // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation - // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups - // Operating at 256 and 128-bit Vector Lengths"). - if ((xcr0 & 0xe6) != 0xe6) { - // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM - // registers, masking, SIMD registers 16-31 (even if accessed as YMM or - // XMM), and EVEX-coded instructions (even on YMM or XMM). 
Even if only - // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on - // shorter vectors, since AVX512 ties everything to the availability of - // 512-bit vectors. See the above-mentioned sections of the Intel manual, - // which say that *all* these XCR0 bits must be checked even when just using - // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD - // Equations for EVEX") which says that all EVEX-coded instructions raise an - // undefined-instruction exception if any of these XCR0 bits is zero. - // - // AVX10 fixes this by reorganizing the features that used to be part of - // "AVX512" and allowing them to be used independently of 512-bit support. - // TODO: add AVX10 detection. - extended_features[0] &= ~(1u << 16); // AVX512F - extended_features[0] &= ~(1u << 17); // AVX512DQ - extended_features[0] &= ~(1u << 21); // AVX512IFMA - extended_features[0] &= ~(1u << 26); // AVX512PF - extended_features[0] &= ~(1u << 27); // AVX512ER - extended_features[0] &= ~(1u << 28); // AVX512CD - extended_features[0] &= ~(1u << 30); // AVX512BW - extended_features[0] &= ~(1u << 31); // AVX512VL - extended_features[1] &= ~(1u << 1); // AVX512VBMI - extended_features[1] &= ~(1u << 6); // AVX512VBMI2 - extended_features[1] &= ~(1u << 11); // AVX512VNNI - extended_features[1] &= ~(1u << 12); // AVX512BITALG - extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ - } - - // Repurpose the bit for the removed MPX feature to indicate when using zmm - // registers should be avoided even when they are supported. (When set, AVX512 - // features can still be used, but only using ymm or xmm registers.) Skylake - // suffered from severe downclocking when zmm registers were used, which - // affected unrelated code running on the system, making zmm registers not too - // useful outside of benchmarks. The situation improved significantly by Ice - // Lake, but a small amount of downclocking remained. 
(See - // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) - // We take a conservative approach of not allowing zmm registers until after - // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. - // - // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported - // to have any downclocking problem when zmm registers are used. - if (is_intel && family == 6 && - (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) - model == 106 || // Ice Lake (server) - model == 108 || // Ice Lake (micro server) - model == 125 || // Ice Lake (client) - model == 126 || // Ice Lake (mobile) - model == 140 || // Tiger Lake (mobile) - model == 141)) { // Tiger Lake (client) - extended_features[0] |= 1u << 14; - } else { - extended_features[0] &= ~(1u << 14); - } - - OPENSSL_ia32cap_P[0] = edx; - OPENSSL_ia32cap_P[1] = ecx; - OPENSSL_ia32cap_P[2] = extended_features[0]; - OPENSSL_ia32cap_P[3] = extended_features[1]; -} - -#endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) diff --git a/crypto/internal.h b/crypto/internal.h index 99223d1aca..c7013b857b 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -441,30 +441,10 @@ static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { // Runtime CPU feature support -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) -// OPENSSL_ia32cap_P contains the Intel CPUID bits when running on an x86 or -// x86-64 system. -// -// Index 0: -// EDX for CPUID where EAX = 1 -// Bit 30 is used to indicate an Intel CPU -// Index 1: -// ECX for CPUID where EAX = 1 -// Index 2: -// EBX for CPUID where EAX = 7, ECX = 0 -// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm -// registers over zmm even when zmm registers are supported -// Index 3: -// ECX for CPUID where EAX = 7, ECX = 0 -// -// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, -// and AVX512 bits in XCR0, so it is not necessary to check those. 
(WARNING: See -// caveats in cpu_intel.c.) #if defined(OPENSSL_X86_64) extern uint32_t avx2_available; extern uint32_t adx_bmi2_available; #endif -#endif #if defined(OPENSSL_ARM) diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs index f45052fe7f..000a6fba86 100644 --- a/src/cpu/intel.rs +++ b/src/cpu/intel.rs @@ -12,6 +12,11 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// "Intel" citations are for "Intel 64 and IA-32 Architectures Software +// Developer’s Manual", Combined Volumes, December 2024. +// "AMD" citations are for "AMD64 Technology AMD64 Architecture +// Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. + use cfg_if::cfg_if; mod abi_assumptions { @@ -36,7 +41,7 @@ mod abi_assumptions { } pub(super) mod featureflags { - use super::super::CAPS_STATIC; + use super::{super::CAPS_STATIC, *}; use crate::{ cpu, polyfill::{once_cell::race, usize_from_u32}, @@ -44,22 +49,13 @@ pub(super) mod featureflags { use core::num::NonZeroUsize; pub(in super::super) fn get_or_init() -> cpu::Features { - // SAFETY: `OPENSSL_cpuid_setup` must be called only in - // `INIT.call_once()` below. - prefixed_extern! { - fn OPENSSL_cpuid_setup(out: &mut [u32; 4]); - } - let _: NonZeroUsize = FEATURES.get_or_init(|| { - let mut cpuid = [0; 4]; - // SAFETY: We assume that it is safe to execute CPUID and XGETBV. - unsafe { - OPENSSL_cpuid_setup(&mut cpuid); - } - let detected = super::cpuid_to_caps_and_set_c_flags(&cpuid); + // SAFETY: `cpuid_all` assumes CPUID is available and that it is + // compatible with Intel. 
+ let cpuid_results = unsafe { cpuid_all() }; + let detected = cpuid_to_caps_and_set_c_flags(cpuid_results); let merged = CAPS_STATIC | detected; - - let merged = usize_from_u32(merged) | (1 << (super::Shift::Initialized as u32)); + let merged = usize_from_u32(merged) | (1 << (Shift::Initialized as u32)); NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. }); @@ -98,30 +94,120 @@ pub(super) mod featureflags { #[cfg(target_arch = "x86_64")] #[rustfmt::skip] pub const STATIC_DETECTED: u32 = 0 - | if cfg!(target_feature = "sse4.1") { super::Sse41::mask() } else { 0 } - | if cfg!(target_feature = "ssse3") { super::Ssse3::mask() } else { 0 } + | if cfg!(target_feature = "sse4.1") { Sse41::mask() } else { 0 } + | if cfg!(target_feature = "ssse3") { Ssse3::mask() } else { 0 } ; pub const FORCE_DYNAMIC_DETECTION: u32 = 0; } -fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { - // "Intel" citations are for "Intel 64 and IA-32 Architectures Software - // Developer’s Manual", Combined Volumes, December 2024. - // "AMD" citations are for "AMD64 Technology AMD64 Architecture - // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. +struct CpuidSummary { + #[cfg(target_arch = "x86_64")] + is_intel: bool, + leaf1_edx: u32, + leaf1_ecx: u32, + #[cfg(target_arch = "x86_64")] + extended_features_ecx: u32, + #[cfg(target_arch = "x86_64")] + extended_features_ebx: u32, + xcr0: u64, +} + +// SAFETY: This unconditionally uses CPUID because we don't have a good +// way to detect CPUID and because we don't know of a CPU that supports +// SSE2 (that we currently statically require) but doesn't support +// CPUID. SGX is one environment where CPUID isn't allowed but where +// SSE2 is statically supported. Ideally there would be a +// `cfg!(target_feature = "cpuid")` we could use. 
+unsafe fn cpuid_all() -> CpuidSummary { + #[cfg(target_arch = "x86")] + use core::arch::x86 as arch; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64 as arch; - // The `prefixed_extern!` uses below assume this + // MSRV(1.66) avoids miscompilations when calling `__cpuid`; + // see https://github.com/rust-lang/rust/pull/101861. + + // Intel: "21.1.1 Notes on Where to Start". + let r = unsafe { arch::__cpuid(0) }; + + let leaf1_edx; + let leaf1_ecx; + + #[cfg(target_arch = "x86_64")] + let is_intel = (r.ebx == 0x756e6547) && (r.edx == 0x49656e69) && (r.ecx == 0x6c65746e); + + #[cfg(target_arch = "x86_64")] + let (extended_features_ecx, extended_features_ebx); + + if r.eax >= 1 { + // SAFETY: `r.eax >= 1` indicates leaf 1 is available. + let r = unsafe { arch::__cpuid(1) }; + leaf1_edx = r.edx; + leaf1_ecx = r.ecx; + + #[cfg(target_arch = "x86_64")] + if r.eax >= 7 { + // SAFETY: `r.eax >= 7` implies we can execute this. + let r = unsafe { arch::__cpuid(7) }; + extended_features_ecx = r.ecx; + extended_features_ebx = r.ebx; + } else { + extended_features_ecx = 0; + extended_features_ebx = 0; + } + } else { + // Expected to be unreachable on any environment we currently + // support. 
+ leaf1_edx = 0; + leaf1_ecx = 0; + #[cfg(target_arch = "x86_64")] + { + extended_features_ecx = 0; + extended_features_ebx = 0; + } + } + + let xcr0 = if check(leaf1_ecx, 27) { + unsafe { arch::_xgetbv(0) } + } else { + 0 + }; + + CpuidSummary { + #[cfg(target_arch = "x86_64")] + is_intel, + leaf1_edx, + leaf1_ecx, + #[cfg(target_arch = "x86_64")] + extended_features_ecx, + #[cfg(target_arch = "x86_64")] + extended_features_ebx, + xcr0, + } +} + +fn cpuid_to_caps_and_set_c_flags(r: CpuidSummary) -> u32 { #[cfg(target_arch = "x86_64")] use core::{mem::align_of, sync::atomic::AtomicU32}; + + let CpuidSummary { + #[cfg(target_arch = "x86_64")] + is_intel, + leaf1_edx, + leaf1_ecx, + #[cfg(target_arch = "x86_64")] + extended_features_ecx, + #[cfg(target_arch = "x86_64")] + extended_features_ebx, + xcr0, + } = r; + + // The `prefixed_extern!` uses below assume this #[cfg(target_arch = "x86_64")] const _ATOMIC32_ALIGNMENT_EQUALS_U32_ALIGNMENT: () = assert!(align_of::<AtomicU32>() == align_of::<u32>()); - fn check(leaf: u32, bit: u32) -> bool { - let shifted = 1 << bit; - (leaf & shifted) == shifted - } fn set(out: &mut u32, shift: Shift) { let shifted = 1 << (shift as u32); debug_assert_eq!(*out & shifted, 0); @@ -129,16 +215,6 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { - #[cfg(target_arch = "x86_64")] - let is_intel = check(cpuid[0], 30); // Synthesized by `OPENSSL_cpuid_setup` - - // CPUID leaf 1. - let leaf1_ecx = cpuid[1]; - - // Intel: "Structured Extended Feature Flags Enumeration Leaf" - #[cfg(target_arch = "x86_64")] - let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); - let mut caps = 0; // AMD: "Collectively the SSE1, [...] 
are referred to as the legacy SSE @@ -179,6 +255,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { set(&mut caps, Shift::Sse2); } } + let _ = leaf1_edx; // Sometimes people delete the `_SSE_REQUIRED`/`_SSE2_REQUIRED` const // assertions in an attempt to support pre-SSE2 32-bit x86 systems. If they @@ -192,76 +269,74 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { // Intel: "12.7.2 Checking for SSSE3 Support" // If/when we support dynamic detection of SSE/SSE2, make this conditional // on SSE/SSE2. + // TODO: Make this conditional on SSE3. if check(leaf1_ecx, 9) { set(&mut caps, Shift::Ssse3); - } - // Intel: "12.12.2 Checking for Intel SSE4.1 Support" - // If/when we support dynamic detection of SSE/SSE2, make this conditional - // on SSE/SSE2. - // XXX: We don't check for SSE3 and we're not sure if it is compatible for - // us to do so; does AMD advertise SSE3? TODO: address this. - // XXX: We don't condition this on SSSE3 being available. TODO: address - // this. - #[cfg(target_arch = "x86_64")] - if check(leaf1_ecx, 19) { - set(&mut caps, Shift::Sse41); + // Intel: "12.12.2 Checking for Intel SSE4.1 Support" + #[cfg(target_arch = "x86_64")] + if check(leaf1_ecx, 19) { + set(&mut caps, Shift::Sse41); + } } // AMD: "The extended SSE instructions include [...]." // Intel: "14.3 DETECTION OF INTEL AVX INSTRUCTIONS" - // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't - // support AVX state. - let avx_available = check(leaf1_ecx, 28); - if avx_available { + let os_supports_xmm_and_ymm = (xcr0 & 6) == 6; + let cpu_supports_avx = check(leaf1_ecx, 28); + + if os_supports_xmm_and_ymm && cpu_supports_avx { set(&mut caps, Shift::Avx); - } - #[cfg(target_arch = "x86_64")] - if avx_available { - // The Intel docs don't seem to document the detection. The instruction - // definitions of the VEX.256 instructions reference the - // VAES/VPCLMULQDQ features and the documentation for the extended - // features gives the values. 
We combine these into one feature because - // we never use them independently. - let vaes_available = check(extended_features_ecx, 9); - let vclmul_available = check(extended_features_ecx, 10); - if vaes_available && vclmul_available { - set(&mut caps, Shift::VAesClmul); + #[cfg(target_arch = "x86_64")] + { + // The Intel docs don't seem to document the detection. The instruction + // definitions of the VEX.256 instructions reference the + // VAES/VPCLMULQDQ features and the documentation for the extended + // features gives the values. We combine these into one feature because + // we never use them independently. + let vaes_available = check(extended_features_ecx, 9); + let vclmul_available = check(extended_features_ecx, 10); + if vaes_available && vclmul_available { + set(&mut caps, Shift::VAesClmul); + } } - } - // "14.7.1 Detection of Intel AVX2 Hardware support" - // XXX: We don't condition AVX2 on AVX. TODO: Address this. - // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't - // support AVX state. - #[cfg(target_arch = "x86_64")] - if check(extended_features_ebx, 5) { - set(&mut caps, Shift::Avx2); + // "14.7.1 Detection of Intel AVX2 Hardware support" + // XXX: We don't condition AVX2 on AVX. TODO: Address this. + // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't + // support AVX state. + #[cfg(target_arch = "x86_64")] + if check(extended_features_ebx, 5) { + set(&mut caps, Shift::Avx2); - // Declared as `uint32_t` in the C code. - prefixed_extern! { - static avx2_available: AtomicU32; + // Declared as `uint32_t` in the C code. + prefixed_extern! { + static avx2_available: AtomicU32; + } + // SAFETY: The C code only reads `avx2_available`, and its reads are + // synchronized through the `OnceNonZeroUsize` Acquire/Release + // semantics as we ensure we have a `cpu::Features` instance before + // calling into the C code. 
+ let flag = unsafe { &avx2_available }; + flag.store(1, core::sync::atomic::Ordering::Relaxed); } - // SAFETY: The C code only reads `avx2_available`, and its reads are - // synchronized through the `OnceNonZeroUsize` Acquire/Release - // semantics as we ensure we have a `cpu::Features` instance before - // calling into the C code. - let flag = unsafe { &avx2_available }; - flag.store(1, core::sync::atomic::Ordering::Relaxed); } // Intel: "12.13.4 Checking for Intel AES-NI Support" // If/when we support dynamic detection of SSE/SSE2, revisit this. // TODO: Clarify "interesting" states like (!SSE && AVX && AES-NI) - // and AES-NI & !AVX. - // Each check of `ClMul`, `Aes`, and `Sha` must be paired with a check for - // an AVX feature (e.g. `Avx`) or an SSE feature (e.g. `Ssse3`), as every - // use will either be supported by SSE* or AVX* instructions. We then - // assume that those supporting instructions' prerequisites (e.g. OS - // support for AVX or SSE state, respectively) are the only prerequisites - // for these features. + // and (AES-NI & !AVX). + // + // PCLMULQDQ and AES-NI instructions come in P* (SSE) and VP* (AVX) + // variants. The use of the SSE variants must be guarded by a check of both + // the `ClMul`/`Aes` feature AND an SSE (e.g. `Ssse3`) or AVX (e.g. `Avx`) + // feature. Which SSE/AVX feature to check for will depend on the + // supporting instructions around the VPCLMULQDQ/AES-NI constructions. + // (PCLMULQDQ and AES-NI also come additional "VPCLMULQDQ"/"VAES" + // variants, which are a separate thing entirely; support for those will be + // added later.) if check(leaf1_ecx, 1) { set(&mut caps, Shift::ClMul); } @@ -302,7 +377,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { // rust `std::arch::is_x86_feature_detected` does a very similar thing // but only looks at AVX, not ADX. Note that they reference an older // version of the erratum labeled SKL052. 
- let believe_bmi_bits = !is_intel || (adx_available || avx_available); + let believe_bmi_bits = !is_intel || (adx_available || cpu_supports_avx); if check(extended_features_ebx, 3) && believe_bmi_bits { set(&mut caps, Shift::Bmi1); @@ -330,6 +405,11 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { caps } +fn check(leaf: u32, bit: u32) -> bool { + let shifted = 1 << bit; + (leaf & shifted) == shifted +} + impl_get_feature! { features: [ { ("x86_64") => VAesClmul },