diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index 8c82f62138..a2b09a8882 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -2413,29 +2413,37 @@ HWY_API Vec256 BroadcastSignBit(const Vec256 v) { return ShiftRight<31>(v); } +#if HWY_TARGET <= HWY_AVX3 + +template +HWY_API Vec256 ShiftRight(const Vec256 v) { + return Vec256{ + _mm256_srai_epi64(v.raw, static_cast(kBits))}; +} + +HWY_API Vec256 BroadcastSignBit(const Vec256 v) { + return ShiftRight<63>(v); +} + +#else // AVX2 + +// Unlike above, this will be used to implement int64_t ShiftRight. HWY_API Vec256 BroadcastSignBit(const Vec256 v) { -#if HWY_TARGET == HWY_AVX2 const DFromV d; return VecFromMask(v < Zero(d)); -#else - return Vec256{_mm256_srai_epi64(v.raw, 63)}; -#endif } template HWY_API Vec256 ShiftRight(const Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{ - _mm256_srai_epi64(v.raw, static_cast(kBits))}; -#else const Full256 di; const Full256 du; const auto right = BitCast(di, ShiftRight(BitCast(du, v))); const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); return right | sign; -#endif } +#endif // #if HWY_TARGET <= HWY_AVX3 + // ------------------------------ IfNegativeThenElse (BroadcastSignBit) HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { @@ -2495,6 +2503,10 @@ HWY_API Vec256 IfNegativeThenNegOrUndefIfZero(Vec256 mask, // ------------------------------ ShiftLeftSame +// Disable sign conversion warnings for GCC debug intrinsics. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { #if HWY_COMPILER_GCC @@ -2642,6 +2654,8 @@ HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { return (shifted ^ shifted_sign) - shifted_sign; } +HWY_DIAGNOSTICS(pop) + // ------------------------------ Neg (Xor, Sub) // Tag dispatch instead of SFINAE for MSVC 2017 compatibility diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index dc73422860..9ff75bdf7b 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -1478,7 +1478,11 @@ HWY_API Vec512 Ror(Vec512 a, Vec512 b) { // ------------------------------ ShiftLeftSame // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512 -// shift-with-immediate: the counts should all be unsigned int. +// shift-with-immediate: the counts should all be unsigned int. Despite casting, +// we still see warnings in GCC debug builds, hence disable. +HWY_DIAGNOSTICS(push) +HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") + #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100 using Shift16Count = int; using Shift3264Count = int; @@ -1642,6 +1646,8 @@ HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { return (shifted ^ shifted_sign) - shifted_sign; } +HWY_DIAGNOSTICS(pop) + // ------------------------------ Minimum // Unsigned @@ -2946,11 +2952,28 @@ HWY_API Vec512 BroadcastSignBit(Vec512 v) { // ------------------------------ Floating-point classification (Not) +namespace detail { + +template +__mmask32 Fix_mm512_fpclass_ph_mask(__m512h v) { +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500 + // GCC's _mm512_cmp_ph_mask uses `__mmask8` instead of `__mmask32`, hence only + // the first 8 lanes are set. + return static_cast<__mmask32>(__builtin_ia32_fpclassph512_mask( + static_cast<__v32hf>(v), kCategories, static_cast<__mmask32>(-1))); +#else + return _mm512_fpclass_ph_mask(v, kCategories); +#endif +} + +} // namespace detail + #if HWY_HAVE_FLOAT16 || HWY_IDE HWY_API Mask512 IsNaN(Vec512 v) { - return Mask512{_mm512_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; + constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN; + return Mask512{ + detail::Fix_mm512_fpclass_ph_mask(v.raw)}; } HWY_API Mask512 IsEitherNaN(Vec512 a, @@ -2963,15 +2986,18 @@ HWY_API Mask512 IsEitherNaN(Vec512 a, } HWY_API Mask512 IsInf(Vec512 v) { - return Mask512{_mm512_fpclass_ph_mask(v.raw, 0x18)}; + constexpr int kCategories = HWY_X86_FPCLASS_POS_INF | HWY_X86_FPCLASS_NEG_INF; + return Mask512{ + detail::Fix_mm512_fpclass_ph_mask(v.raw)}; } // Returns whether normal/subnormal/zero. fpclass doesn't have a flag for // positive, so we have to check for inf/NaN and negate. HWY_API Mask512 IsFinite(Vec512 v) { - return Not(Mask512{_mm512_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); + constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | + HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF; + return Not(Mask512{ + detail::Fix_mm512_fpclass_ph_mask(v.raw)}); } #endif // HWY_HAVE_FLOAT16