From c32b4ae96bb96df19f1b059183a5751da98ee684 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 11:42:12 +0200 Subject: [PATCH 01/12] fix AMX sample -- Similar to GATHER instructions, AMX-INT8 and AMX-BF16 instructions cannot have the same operand more than once Intel SDM 325462-084US, p. 585 Checked with XED version: [v2024.04.01] --- test/amx.asm | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test/amx.asm b/test/amx.asm index 88455508d..6ee3b28b6 100644 --- a/test/amx.asm +++ b/test/amx.asm @@ -1,7 +1,9 @@ bits 64 -%macro amx 1 +%macro amx 3 %define treg tmm %+ %1 + %define treg2 tmm %+ %2 + %define treg3 tmm %+ %3 ldtilecfg [rsi] sttilecfg [rdi] @@ -16,11 +18,11 @@ tileloaddt1 treg, [rax,rdx] tileloaddt1 treg, [rax,rdx*2] - tdpbf16ps treg, treg, treg - tdpbssd treg, treg, treg - tdpbusd treg, treg, treg - tdpbsud treg, treg, treg - tdpbuud treg, treg, treg + tdpbf16ps treg, treg2, treg3 + tdpbssd treg, treg2, treg3 + tdpbusd treg, treg2, treg3 + tdpbsud treg, treg2, treg3 + tdpbuud treg, treg2, treg3 tilestored [rax], treg tilestored [rax,rdx], treg @@ -30,7 +32,11 @@ %endmacro %assign n 0 +%assign m 1 +%assign l 2 %rep 8 - amx n - %assign n n+1 + amx n, m, l + %assign n ((n+1) % 8) + %assign m ((m+1) % 8) + %assign l ((l+1) % 8) %endrep From 5ee33d0ac8a4d35332abd96332a0c4dba6b0dd3a Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 11:54:56 +0200 Subject: [PATCH 02/12] AMX-FP16 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -- According to Intel® Architecture Instruction Set Extensions and Future Features 319433-053, p. 
158 Checked with XED version: [v2024.04.01] --- test/amx.asm | 1 + x86/iflags.ph | 1 + x86/insns.dat | 1 + 3 files changed, 3 insertions(+) diff --git a/test/amx.asm b/test/amx.asm index 6ee3b28b6..fa52d55e6 100644 --- a/test/amx.asm +++ b/test/amx.asm @@ -23,6 +23,7 @@ tdpbusd treg, treg2, treg3 tdpbsud treg, treg2, treg3 tdpbuud treg, treg2, treg3 + tdpfp16ps treg, treg2, treg3 tilestored [rax], treg tilestored [rax,rdx], treg diff --git a/x86/iflags.ph b/x86/iflags.ph index 73a115f89..389471e5d 100644 --- a/x86/iflags.ph +++ b/x86/iflags.ph @@ -115,6 +115,7 @@ if_("SMAP", "Supervisor Mode Access Prevention (SMAP)"); if_("SHA512", "SHA512 instructions"); if_("SM3", "SM3 instructions"); if_("SM4", "SM4 instructions"); +if_("AMXFP16", "AMX float16 multiplication"); # Put these last to minimize their relevance if_("OBSOLETE", "Instruction removed from architecture"); diff --git a/x86/insns.dat b/x86/insns.dat index 1337869cf..345f3f3af 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -6095,6 +6095,7 @@ VP2INTERSECTD kreg|rs2,zmmreg,zmmrm128|b32 [rvm:fv: evex.nds.512.f2.0f38.w0 68 LDTILECFG mem512 [m: vex.128.np.0f38.w0 49 /0] AMXTILE,FUTURE,SZ,LONG STTILECFG mem512 [m: vex.128.66.0f38.w0 49 /0] AMXTILE,FUTURE,SZ,LONG TDPBF16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.f3.0f38.w0 5c /r] AMXBF16,FUTURE,LONG +TDPFP16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.f2.0f38.w0 5c /r] AMXFP16,FUTURE,LONG TDPBSSD tmmreg,tmmreg,tmmreg [rmv: vex.128.f2.0f38.w0 5e /r] AMXINT8,FUTURE,LONG TDPBSUD tmmreg,tmmreg,tmmreg [rmv: vex.128.f3.0f38.w0 5e /r] AMXINT8,FUTURE,LONG TDPBUSD tmmreg,tmmreg,tmmreg [rmv: vex.128.66.0f38.w0 5e /r] AMXINT8,FUTURE,LONG From afef16b7808f8a186321e565af3876a57a424885 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 12:03:00 +0200 Subject: [PATCH 03/12] AMX-COMPLEX support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -- According to Intel® Architecture Instruction Set Extensions and Future Features 
319433-053, p. 155 Checked with XED version: [v2024.04.01] --- test/amx.asm | 2 ++ x86/iflags.ph | 1 + x86/insns.dat | 2 ++ 3 files changed, 5 insertions(+) diff --git a/test/amx.asm b/test/amx.asm index fa52d55e6..d4cc906ca 100644 --- a/test/amx.asm +++ b/test/amx.asm @@ -24,6 +24,8 @@ tdpbsud treg, treg2, treg3 tdpbuud treg, treg2, treg3 tdpfp16ps treg, treg2, treg3 + tcmmimfp16ps treg, treg2, treg3 + tcmmrlfp16ps treg, treg2, treg3 tilestored [rax], treg tilestored [rax,rdx], treg diff --git a/x86/iflags.ph b/x86/iflags.ph index 389471e5d..6309350f5 100644 --- a/x86/iflags.ph +++ b/x86/iflags.ph @@ -116,6 +116,7 @@ if_("SHA512", "SHA512 instructions"); if_("SM3", "SM3 instructions"); if_("SM4", "SM4 instructions"); if_("AMXFP16", "AMX float16 multiplication"); +if_("AMXCOMPLEX", "AMX float16 complex multiplication"); # Put these last to minimize their relevance if_("OBSOLETE", "Instruction removed from architecture"); diff --git a/x86/insns.dat b/x86/insns.dat index 345f3f3af..906206522 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -6096,6 +6096,8 @@ LDTILECFG mem512 [m: vex.128.np.0f38.w0 49 /0] AMXTILE,FUTURE,SZ,LONG STTILECFG mem512 [m: vex.128.66.0f38.w0 49 /0] AMXTILE,FUTURE,SZ,LONG TDPBF16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.f3.0f38.w0 5c /r] AMXBF16,FUTURE,LONG TDPFP16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.f2.0f38.w0 5c /r] AMXFP16,FUTURE,LONG +TCMMIMFP16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.66.0f38.w0 6c /r] AMXCOMPLEX,FUTURE,LONG +TCMMRLFP16PS tmmreg,tmmreg,tmmreg [rmv: vex.128.np.0f38.w0 6c /r] AMXCOMPLEX,FUTURE,LONG TDPBSSD tmmreg,tmmreg,tmmreg [rmv: vex.128.f2.0f38.w0 5e /r] AMXINT8,FUTURE,LONG TDPBSUD tmmreg,tmmreg,tmmreg [rmv: vex.128.f3.0f38.w0 5e /r] AMXINT8,FUTURE,LONG TDPBUSD tmmreg,tmmreg,tmmreg [rmv: vex.128.66.0f38.w0 5e /r] AMXINT8,FUTURE,LONG From e7d727cf832065df58dbeb24533375e85d2d42d0 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 12:40:33 +0200 Subject: [PATCH 04/12] fix: AVX-NE-CONVERT instructions MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - According to Intel® Architecture Instruction Set Extensions and Future Features 319433-053, p. 112-118: -- VBCSTNEBF16PS -> VBCSTNEBF162PS -- VCVTNEPS2BF16 target always xmm -- no AVX512 EVEX version, so no LATEVEX -- avx-ne-convert.asm, avx-ne-convert-64.asm test files Checked with XED version: [v2024.04.01] --- test/avx-ne-convert-64.asm | 20 ++++++++++++++++++++ test/avx-ne-convert.asm | 17 +++++++++++++++++ x86/insns.dat | 29 ++++++++++++++--------------- 3 files changed, 51 insertions(+), 15 deletions(-) create mode 100644 test/avx-ne-convert-64.asm create mode 100644 test/avx-ne-convert.asm diff --git a/test/avx-ne-convert-64.asm b/test/avx-ne-convert-64.asm new file mode 100644 index 000000000..1d38ed0cd --- /dev/null +++ b/test/avx-ne-convert-64.asm @@ -0,0 +1,20 @@ +BITS 64 + vbcstnebf162ps xmm1, [rax] + vbcstnebf162ps ymm1, [rax] + vbcstnesh2ps xmm1, [rax] + vbcstnesh2ps ymm1, [rax] + vcvtneebf162ps xmm1, oword [rbx] + vcvtneebf162ps ymm1, yword [rcx] + vcvtneeph2ps xmm1, oword [rbx] + vcvtneeph2ps ymm1, yword [rcx] + vcvtneobf162ps xmm1, oword [rbx] + vcvtneobf162ps ymm1, yword [rcx] + vcvtneoph2ps xmm1, oword [rbx] + vcvtneoph2ps ymm1, yword [rcx] + vcvtneps2bf16 xmm1, xmm2 + vcvtneps2bf16 xmm1, ymm2 + vcvtneps2bf16 xmm1, oword [rbx] + vcvtneps2bf16 xmm1, yword [rbx] + + + diff --git a/test/avx-ne-convert.asm b/test/avx-ne-convert.asm new file mode 100644 index 000000000..fd99f2bf4 --- /dev/null +++ b/test/avx-ne-convert.asm @@ -0,0 +1,17 @@ +BITS 32 + vbcstnebf162ps xmm1, [eax] + vbcstnebf162ps ymm1, [eax] + vbcstnesh2ps xmm1, [eax] + vbcstnesh2ps ymm1, [eax] + vcvtneebf162ps xmm1, oword [ebx] + vcvtneebf162ps ymm1, yword [ecx] + vcvtneeph2ps xmm1, oword [ebx] + vcvtneeph2ps ymm1, yword [ecx] + vcvtneobf162ps xmm1, oword [ebx] + vcvtneobf162ps ymm1, yword [ecx] + vcvtneoph2ps xmm1, oword [ebx] + vcvtneoph2ps ymm1, yword [ecx] + vcvtneps2bf16 xmm1, xmm2 + vcvtneps2bf16 xmm1, 
ymm2 + vcvtneps2bf16 xmm1, oword [ebx] + vcvtneps2bf16 xmm1, yword [ebx] diff --git a/x86/insns.dat b/x86/insns.dat index 906206522..41e7ae2b4 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3603,21 +3603,20 @@ VSM4RNDS4 xmmreg,xmmreg,xmmrm128 [rvm: vex.nds.128.f2.0f38.w0 VSM4RNDS4 ymmreg,ymmreg,ymmrm128 [rvm: vex.nds.256.f2.0f38.w0 da /r] SM4,AVX,FUTURE ;# AVX no exception conversions -; Must precede AVX-512 versions -VBCSTNEBF16PS xmmreg,mem16 [rm: vex.128.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,LATEVEX,SW -VBCSTNEBF16PS ymmreg,mem16 [rm: vex.256.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,LATEVEX,SW -VBCSTNESH2PS xmmreg,mem16 [rm: vex.128.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,LATEVEX,SW -VBCSTNESH2PS ymmreg,mem16 [rm: vex.256.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,LATEVEX,SW -VCVTNEEBF162PS xmmreg,mem128 [rm: vex.128.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SX -VCVTNEEBF162PS ymmreg,mem256 [rm: vex.256.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SY -VCVTNEEPH2PS xmmreg,mem128 [rm: vex.128.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SX -VCVTNEEPH2PS ymmreg,mem256 [rm: vex.256.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SY -VCVTNEOBF162PS xmmreg,mem128 [rm: vex.128.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SX -VCVTNEOBF162PS ymmreg,mem256 [rm: vex.256.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SY -VCVTNEOPH2PS xmmreg,mem128 [rm: vex.128.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SX -VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,LATEVEX,SY -VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,LATEVEX,SX -VCVTNEPS2BF16 ymmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,LATEVEX,SY +VBCSTNEBF162PS xmmreg,mem16 [rm: vex.128.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW +VBCSTNEBF162PS ymmreg,mem16 [rm: vex.256.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW +VBCSTNESH2PS xmmreg,mem16 [rm: vex.128.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW +VBCSTNESH2PS ymmreg,mem16 [rm: 
vex.256.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW +VCVTNEEBF162PS xmmreg,mem128 [rm: vex.128.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX +VCVTNEEBF162PS ymmreg,mem256 [rm: vex.256.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY +VCVTNEEPH2PS xmmreg,mem128 [rm: vex.128.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX +VCVTNEEPH2PS ymmreg,mem256 [rm: vex.256.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY +VCVTNEOBF162PS xmmreg,mem128 [rm: vex.128.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX +VCVTNEOBF162PS ymmreg,mem256 [rm: vex.256.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY +VCVTNEOPH2PS xmmreg,mem128 [rm: vex.128.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX +VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY +VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SX +VCVTNEPS2BF16 xmmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SY ;# AVX Vector Neural Network Instructions ; Must precede AVX-512 versions From 10f1d6fd782ce34e1da9ca5e95e1fabc54d2b83d Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 13:00:43 +0200 Subject: [PATCH 05/12] fix: AVX-VNNI-8 instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - According to Intel® Architecture Instruction Set Extensions and Future Features 319433-053, p. 
120: -- no AVX512 EVEX version, so no LATEVEX -- avx-vnni-int8.asm, avx-vnni-int8-64.asm test files -- unsized operands allowed Checked with XED version: [v2024.04.01] --- test/avx-vnni-int8-64.asm | 62 +++++++++++++++++++++++++++++++++++++++ test/avx-vnni-int8.asm | 62 +++++++++++++++++++++++++++++++++++++++ x86/insns.dat | 27 ++++++++--------- 3 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 test/avx-vnni-int8-64.asm create mode 100644 test/avx-vnni-int8.asm diff --git a/test/avx-vnni-int8-64.asm b/test/avx-vnni-int8-64.asm new file mode 100644 index 000000000..4a90d99c9 --- /dev/null +++ b/test/avx-vnni-int8-64.asm @@ -0,0 +1,62 @@ +BITS 64 + vpdpbsud xmm1, xmm2, xmm0 + vpdpbsud xmm2, xmm3, [rax] + vpdpbsud xmm3, xmm14, oword [rax+0x12] + vpdpbsud xmm14, xmm5, [rax+rbx*2] + + vpdpbsud ymm1, ymm2, ymm0 + vpdpbsud ymm2, ymm3, [rax] + vpdpbsud ymm3, ymm14, yword [rax+0x12] + vpdpbsud ymm14, ymm5, [rax+rbx*2] + + vpdpbsuds xmm1, xmm2, xmm0 + vpdpbsuds xmm2, xmm3, [rax] + vpdpbsuds xmm3, xmm14, [rax+0x12] + vpdpbsuds xmm14, xmm5, [rax+rbx*2] + + vpdpbsuds ymm1, ymm2, ymm0 + vpdpbsuds ymm2, ymm3, [rax] + vpdpbsuds ymm3, ymm14, [rax+0x12] + vpdpbsuds ymm14, ymm5, [rax+rbx*2] + + vpdpbssd xmm1, xmm2, xmm0 + vpdpbssd xmm2, xmm3, [rax] + vpdpbssd xmm3, xmm14, [rax+0x12] + vpdpbssd xmm14, xmm5, [rax+rbx*2] + + vpdpbssd ymm1, ymm2, ymm0 + vpdpbssd ymm2, ymm3, [rax] + vpdpbssd ymm3, ymm14, [rax+0x12] + vpdpbssd ymm14, ymm5, [rax+rbx*2] + + vpdpbssds xmm1, xmm2, xmm0 + vpdpbssds xmm2, xmm3, [rax] + vpdpbssds xmm3, xmm14, [rax+0x12] + vpdpbssds xmm14, xmm5, [rax+rbx*2] + + vpdpbssds ymm1, ymm2, ymm0 + vpdpbssds ymm2, ymm3, [rax] + vpdpbssds ymm3, ymm14, [rax+0x12] + vpdpbssds ymm14, ymm5, [rax+rbx*2] + + vpdpbuud xmm1, xmm2, xmm0 + vpdpbuud xmm2, xmm3, [rax] + vpdpbuud xmm3, xmm14, [rax+0x12] + vpdpbuud xmm14, xmm5, [rax+rbx*2] + + vpdpbuud ymm1, ymm2, ymm0 + vpdpbuud ymm2, ymm3, [rax] + vpdpbuud ymm3, ymm14, [rax+0x12] + vpdpbuud ymm14, ymm5, [rax+rbx*2] 
+ + vpdpbuuds xmm1, xmm2, xmm0 + vpdpbuuds xmm2, xmm3, [rax] + vpdpbuuds xmm3, xmm14, [rax+0x12] + vpdpbuuds xmm14, xmm5, [rax+rbx*2] + + vpdpbuuds ymm1, ymm2, ymm0 + vpdpbuuds ymm2, ymm3, [rax] + vpdpbuuds ymm3, ymm14, [rax+0x12] + vpdpbuuds ymm14, ymm5, [rax+rbx*2] + + diff --git a/test/avx-vnni-int8.asm b/test/avx-vnni-int8.asm new file mode 100644 index 000000000..3d5a5258e --- /dev/null +++ b/test/avx-vnni-int8.asm @@ -0,0 +1,62 @@ +BITS 32 + vpdpbsud xmm1, xmm2, xmm0 + vpdpbsud xmm2, xmm3, [eax] + vpdpbsud xmm3, xmm4, [eax+0x12] + vpdpbsud xmm4, xmm5, [eax+ebx*2] + + vpdpbsud ymm1, ymm2, ymm0 + vpdpbsud ymm2, ymm3, [eax] + vpdpbsud ymm3, ymm4, [eax+0x12] + vpdpbsud ymm4, ymm5, [eax+ebx*2] + + vpdpbsuds xmm1, xmm2, xmm0 + vpdpbsuds xmm2, xmm3, [eax] + vpdpbsuds xmm3, xmm4, [eax+0x12] + vpdpbsuds xmm4, xmm5, [eax+ebx*2] + + vpdpbsuds ymm1, ymm2, ymm0 + vpdpbsuds ymm2, ymm3, [eax] + vpdpbsuds ymm3, ymm4, [eax+0x12] + vpdpbsuds ymm4, ymm5, [eax+ebx*2] + + vpdpbssd xmm1, xmm2, xmm0 + vpdpbssd xmm2, xmm3, [eax] + vpdpbssd xmm3, xmm4, [eax+0x12] + vpdpbssd xmm4, xmm5, [eax+ebx*2] + + vpdpbssd ymm1, ymm2, ymm0 + vpdpbssd ymm2, ymm3, [eax] + vpdpbssd ymm3, ymm4, [eax+0x12] + vpdpbssd ymm4, ymm5, [eax+ebx*2] + + vpdpbssds xmm1, xmm2, xmm0 + vpdpbssds xmm2, xmm3, [eax] + vpdpbssds xmm3, xmm4, [eax+0x12] + vpdpbssds xmm4, xmm5, [eax+ebx*2] + + vpdpbssds ymm1, ymm2, ymm0 + vpdpbssds ymm2, ymm3, [eax] + vpdpbssds ymm3, ymm4, [eax+0x12] + vpdpbssds ymm4, ymm5, [eax+ebx*2] + + vpdpbuud xmm1, xmm2, xmm0 + vpdpbuud xmm2, xmm3, [eax] + vpdpbuud xmm3, xmm4, [eax+0x12] + vpdpbuud xmm4, xmm5, [eax+ebx*2] + + vpdpbuud ymm1, ymm2, ymm0 + vpdpbuud ymm2, ymm3, [eax] + vpdpbuud ymm3, ymm4, [eax+0x12] + vpdpbuud ymm4, ymm5, [eax+ebx*2] + + vpdpbuuds xmm1, xmm2, xmm0 + vpdpbuuds xmm2, xmm3, [eax] + vpdpbuuds xmm3, xmm4, [eax+0x12] + vpdpbuuds xmm4, xmm5, [eax+ebx*2] + + vpdpbuuds ymm1, ymm2, ymm0 + vpdpbuuds ymm2, ymm3, [eax] + vpdpbuuds ymm3, ymm4, [eax+0x12] + vpdpbuuds ymm4, ymm5, 
[eax+ebx*2] + + diff --git a/x86/insns.dat b/x86/insns.dat index 41e7ae2b4..607634b05 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3618,20 +3618,19 @@ VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SX VCVTNEPS2BF16 xmmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SY -;# AVX Vector Neural Network Instructions -; Must precede AVX-512 versions -VPDPBSSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBSSD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY -VPDPBSSDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f2.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBSSDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f2.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY -VPDPBSUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBSUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY -VPDPBSUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBSUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY -VPDPBUUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBUUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY -VPDPBUUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SX -VPDPBUUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,LATEVEX,SY +;# AVX Vector Neural Network Instructions INT8 +VPDPBSSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSSD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSSDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f2.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSSDS ymmreg,ymmreg,ymmrm256 [rvm: 
vex.256.f2.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBSUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBUUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBUUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBUUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +VPDPBUUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX ;# AVX Integer Fused Multiply-Add ; Must precede AVX-512 versions From 0888098fa68600f9f862560e1f91882defeb159c Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 14:20:12 +0200 Subject: [PATCH 06/12] AVX-VNNI support - According to Intel SDM 325462-084US, p. 
2413-2419 -- VEX encoded version of the AVX512_VNNI xmm, ymm instructions -- there is AVX512 EVEX version, so latevex flag required -- avx-vnni.asm, avx-vnni-64.asm test files -- unsized operands allowed Checked with XED version: [v2024.04.01] --- test/avx-vnni-64.asm | 41 +++++++++++++++++++++++++++++++++++++++++ test/avx-vnni.asm | 41 +++++++++++++++++++++++++++++++++++++++++ x86/iflags.ph | 1 + x86/insns.dat | 11 +++++++++++ 4 files changed, 94 insertions(+) create mode 100644 test/avx-vnni-64.asm create mode 100644 test/avx-vnni.asm diff --git a/test/avx-vnni-64.asm b/test/avx-vnni-64.asm new file mode 100644 index 000000000..e7dfd8357 --- /dev/null +++ b/test/avx-vnni-64.asm @@ -0,0 +1,41 @@ +BITS 64 + cpu latevex + vpdpbusd xmm1, xmm2, xmm0 + vpdpbusd xmm2, xmm3, [rax] + vpdpbusd xmm3, xmm4, [rax+0x12] + vpdpbusd xmm4, xmm5, [rax+rbx*2] + + vpdpbusd ymm1, ymm2, ymm0 + vpdpbusd ymm2, ymm3, [rax] + vpdpbusd ymm3, ymm4, [rax+0x12] + vpdpbusd ymm4, ymm5, [rax+rbx*2] + + vpdpbusds xmm1, xmm2, xmm0 + vpdpbusds xmm2, xmm3, [rax] + vpdpbusds xmm3, xmm4, [rax+0x12] + vpdpbusds xmm4, xmm5, [rax+rbx*2] + + vpdpbusds ymm1, ymm2, ymm0 + vpdpbusds ymm2, ymm3, [rax] + vpdpbusds ymm3, ymm4, [rax+0x12] + vpdpbusds ymm4, ymm5, [rax+rbx*2] + + vpdpwssd xmm1, xmm2, xmm0 + vpdpwssd xmm2, xmm3, [rax] + vpdpwssd xmm3, xmm4, [rax+0x12] + vpdpwssd xmm4, xmm5, [rax+rbx*2] + + vpdpwssd ymm1, ymm2, ymm0 + vpdpwssd ymm2, ymm3, [rax] + vpdpwssd ymm3, ymm4, [rax+0x12] + vpdpwssd ymm4, ymm5, [rax+rbx*2] + + vpdpwssds xmm1, xmm2, xmm0 + vpdpwssds xmm2, xmm3, [rax] + vpdpwssds xmm3, xmm4, [rax+0x12] + vpdpwssds xmm4, xmm5, [rax+rbx*2] + + vpdpwssds ymm1, ymm2, ymm0 + vpdpwssds ymm2, ymm3, [rax] + vpdpwssds ymm3, ymm4, [rax+0x12] + vpdpwssds ymm4, ymm5, [rax+rbx*2] diff --git a/test/avx-vnni.asm b/test/avx-vnni.asm new file mode 100644 index 000000000..0965814e2 --- /dev/null +++ b/test/avx-vnni.asm @@ -0,0 +1,41 @@ +BITS 32 + cpu latevex + vpdpbusd xmm1, xmm2, xmm0 + vpdpbusd xmm2, xmm3, 
[eax] + vpdpbusd xmm3, xmm4, [eax+0x12] + vpdpbusd xmm4, xmm5, [eax+ebx*2] + + vpdpbusd ymm1, ymm2, ymm0 + vpdpbusd ymm2, ymm3, [eax] + vpdpbusd ymm3, ymm4, [eax+0x12] + vpdpbusd ymm4, ymm5, [eax+ebx*2] + + vpdpbusds xmm1, xmm2, xmm0 + vpdpbusds xmm2, xmm3, [eax] + vpdpbusds xmm3, xmm4, [eax+0x12] + vpdpbusds xmm4, xmm5, [eax+ebx*2] + + vpdpbusds ymm1, ymm2, ymm0 + vpdpbusds ymm2, ymm3, [eax] + vpdpbusds ymm3, ymm4, [eax+0x12] + vpdpbusds ymm4, ymm5, [eax+ebx*2] + + vpdpwssd xmm1, xmm2, xmm0 + vpdpwssd xmm2, xmm3, [eax] + vpdpwssd xmm3, xmm4, [eax+0x12] + vpdpwssd xmm4, xmm5, [eax+ebx*2] + + vpdpwssd ymm1, ymm2, ymm0 + vpdpwssd ymm2, ymm3, [eax] + vpdpwssd ymm3, ymm4, [eax+0x12] + vpdpwssd ymm4, ymm5, [eax+ebx*2] + + vpdpwssds xmm1, xmm2, xmm0 + vpdpwssds xmm2, xmm3, [eax] + vpdpwssds xmm3, xmm4, [eax+0x12] + vpdpwssds xmm4, xmm5, [eax+ebx*2] + + vpdpwssds ymm1, ymm2, ymm0 + vpdpwssds ymm2, ymm3, [eax] + vpdpwssds ymm3, ymm4, [eax+0x12] + vpdpwssds ymm4, ymm5, [eax+ebx*2] diff --git a/x86/iflags.ph b/x86/iflags.ph index 6309350f5..f8d331244 100644 --- a/x86/iflags.ph +++ b/x86/iflags.ph @@ -117,6 +117,7 @@ if_("SM3", "SM3 instructions"); if_("SM4", "SM4 instructions"); if_("AMXFP16", "AMX float16 multiplication"); if_("AMXCOMPLEX", "AMX float16 complex multiplication"); +if_("AVXVNNI", "AVX Vector Neural Network instructions"); # Put these last to minimize their relevance if_("OBSOLETE", "Instruction removed from architecture"); diff --git a/x86/insns.dat b/x86/insns.dat index 607634b05..9bb5ddc10 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3618,6 +3618,17 @@ VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SX VCVTNEPS2BF16 xmmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SY +;# AVX Vector Neural Network Instructions +; Must precede AVX-512 versions +VPDPBUSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 50 /r] 
AVXVNNI,FUTURE,LATEVEX +VPDPBUSD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 50 /r] AVXVNNI,FUTURE,LATEVEX +VPDPBUSDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 51 /r] AVXVNNI,FUTURE,LATEVEX +VPDPBUSDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 51 /r] AVXVNNI,FUTURE,LATEVEX +VPDPWSSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 52 /r] AVXVNNI,FUTURE,LATEVEX +VPDPWSSD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 52 /r] AVXVNNI,FUTURE,LATEVEX +VPDPWSSDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 53 /r] AVXVNNI,FUTURE,LATEVEX +VPDPWSSDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 53 /r] AVXVNNI,FUTURE,LATEVEX + ;# AVX Vector Neural Network Instructions INT8 VPDPBSSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX VPDPBSSD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f2.0f38.w0 50 /r] AVXVNNIINT8,FUTURE,AVX From 4e49f599a8d14c960562be18c88352059137a431 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 14:21:59 +0200 Subject: [PATCH 07/12] fix: AVX-IFMA unsized operands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - According to Intel® Architecture Instruction Set Extensions and Future Features 319433-053, p. 
126-127: -- there is AVX512 EVEX version, so latevex flag required -- avx-ifma.asm, avx-ifma-64.asm test files -- unsized operands allowed Checked with XED version: [v2024.04.01] --- test/avx-ifma-64.asm | 6 ++++++ test/avx-ifma.asm | 7 +++++++ x86/insns.dat | 8 ++++---- 3 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 test/avx-ifma-64.asm create mode 100644 test/avx-ifma.asm diff --git a/test/avx-ifma-64.asm b/test/avx-ifma-64.asm new file mode 100644 index 000000000..b6273d097 --- /dev/null +++ b/test/avx-ifma-64.asm @@ -0,0 +1,6 @@ +bits 64 + cpu latevex + vpmadd52luq xmm0, xmm1, [rax] + vpmadd52luq ymm2, ymm3, [rbx] + vpmadd52huq xmm14, xmm5, [rax+rbx] + vpmadd52huq ymm12, ymm7, [rax*2] diff --git a/test/avx-ifma.asm b/test/avx-ifma.asm new file mode 100644 index 000000000..89b43b153 --- /dev/null +++ b/test/avx-ifma.asm @@ -0,0 +1,7 @@ +bits 32 + cpu latevex + vpmadd52luq xmm0, xmm1, [eax] + vpmadd52luq ymm2, ymm3, [ebx] + vpmadd52huq xmm4, xmm5, [eax+ebx] + vpmadd52huq ymm6, ymm7, [eax*2] + diff --git a/x86/insns.dat b/x86/insns.dat index 9bb5ddc10..eb18ca2ea 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3645,10 +3645,10 @@ VPDPBUUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 51 /r] AVXVNNIINT ;# AVX Integer Fused Multiply-Add ; Must precede AVX-512 versions -VPMADD52HUQ xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w1 b5 /r] AVXIFMA,FUTURE,LATEVEX,SX -VPMADD52HUQ ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w1 b5 /r] AVXIFMA,FUTURE,LATEVEX,SY -VPMADD52LUQ xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w1 b4 /r] AVXIFMA,FUTURE,LATEVEX,SX -VPMADD52LUQ ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w1 b4 /r] AVXIFMA,FUTURE,LATEVEX,SY +VPMADD52HUQ xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w1 b5 /r] AVXIFMA,FUTURE,LATEVEX +VPMADD52HUQ ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w1 b5 /r] AVXIFMA,FUTURE,LATEVEX +VPMADD52LUQ xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w1 b4 /r] AVXIFMA,FUTURE,LATEVEX +VPMADD52LUQ ymmreg,ymmreg,ymmrm256 
[rvm: vex.256.66.0f38.w1 b4 /r] AVXIFMA,FUTURE,LATEVEX ;# AVX-512 mask register instructions KADDB kreg,kreg,kreg [rvm: vex.nds.l1.66.0f.w0 4a /r ] FUTURE From 6387b82e55da2bf875e86e6c2df62e4fc99fd2f4 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 12 Jul 2024 14:29:59 +0200 Subject: [PATCH 08/12] AVX-VNNI-INT16 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 6, AVX-VNNI-INT16 support - According to Intel® Architecture Instruction Set Extensions and Future Features 319433-053, p. 123-124: -- no AVX512 EVEX version, so LATEVEX not required -- avx-vnni-int16.asm, avx-vnni-int16-64.asm test files -- unsized operands allowed Checked with XED version: [v2024.04.01] --- test/avx-vnni-int16-64.asm | 62 ++++++++++++++++++++++++++++++++++++++ test/avx-vnni-int16.asm | 62 ++++++++++++++++++++++++++++++++++++++ x86/iflags.ph | 1 + x86/insns.dat | 14 +++++++++ 4 files changed, 139 insertions(+) create mode 100644 test/avx-vnni-int16-64.asm create mode 100644 test/avx-vnni-int16.asm diff --git a/test/avx-vnni-int16-64.asm b/test/avx-vnni-int16-64.asm new file mode 100644 index 000000000..9854cdbda --- /dev/null +++ b/test/avx-vnni-int16-64.asm @@ -0,0 +1,62 @@ +BITS 64 + vpdpwsud xmm1, xmm2, xmm0 + vpdpwsud xmm2, xmm3, [rax] + vpdpwsud xmm3, xmm4, [rax+0x12] + vpdpwsud xmm4, xmm5, [rax+rbx*2] + + vpdpwsud ymm1, ymm2, ymm0 + vpdpwsud ymm2, ymm3, [rax] + vpdpwsud ymm3, ymm14, [rax+0x12] + vpdpwsud ymm14, ymm5, [rax+rbx*2] + + vpdpwsuds xmm1, xmm2, xmm0 + vpdpwsuds xmm2, xmm3, [rax] + vpdpwsuds xmm3, xmm14, [rax+0x12] + vpdpwsuds xmm14, xmm5, [rax+rbx*2] + + vpdpwsuds ymm1, ymm2, ymm0 + vpdpwsuds ymm2, ymm3, [rax] + vpdpwsuds ymm3, ymm14, [rax+0x12] + vpdpwsuds ymm14, ymm5, [rax+rbx*2] + + vpdpwusd xmm1, xmm2, xmm0 + vpdpwusd xmm2, xmm3, [rax] + vpdpwusd xmm3, xmm14, [rax+0x12] + vpdpwusd xmm14, xmm5, [rax+rbx*2] + + vpdpwusd ymm1, ymm2, ymm0 + vpdpwusd ymm2, ymm3, [rax] + vpdpwusd ymm3, ymm14, [rax+0x12] + vpdpwusd ymm14, 
ymm5, [rax+rbx*2] + + vpdpwusds xmm1, xmm2, xmm0 + vpdpwusds xmm2, xmm3, [rax] + vpdpwusds xmm3, xmm14, [rax+0x12] + vpdpwusds xmm14, xmm5, [rax+rbx*2] + + vpdpwusds ymm1, ymm2, ymm0 + vpdpwusds ymm2, ymm3, [rax] + vpdpwusds ymm3, ymm14, [rax+0x12] + vpdpwusds ymm14, ymm5, [rax+rbx*2] + + vpdpwuud xmm1, xmm2, xmm0 + vpdpwuud xmm2, xmm3, [rax] + vpdpwuud xmm3, xmm14, [rax+0x12] + vpdpwuud xmm14, xmm5, [rax+rbx*2] + + vpdpwuud ymm1, ymm2, ymm0 + vpdpwuud ymm2, ymm3, [rax] + vpdpwuud ymm3, ymm14, [rax+0x12] + vpdpwuud ymm14, ymm5, [rax+rbx*2] + + vpdpwuuds xmm1, xmm2, xmm0 + vpdpwuuds xmm2, xmm3, [rax] + vpdpwuuds xmm3, xmm14, [rax+0x12] + vpdpwuuds xmm14, xmm5, [rax+rbx*2] + + vpdpwuuds ymm1, ymm2, ymm0 + vpdpwuuds ymm2, ymm3, [rax] + vpdpwuuds ymm3, ymm14, [rax+0x12] + vpdpwuuds ymm14, ymm5, [rax+rbx*2] + + diff --git a/test/avx-vnni-int16.asm b/test/avx-vnni-int16.asm new file mode 100644 index 000000000..b9e4eb23f --- /dev/null +++ b/test/avx-vnni-int16.asm @@ -0,0 +1,62 @@ +BITS 32 + vpdpwsud xmm1, xmm2, xmm0 + vpdpwsud xmm2, xmm3, [eax] + vpdpwsud xmm3, xmm4, [eax+0x12] + vpdpwsud xmm4, xmm5, [eax+ebx*2] + + vpdpwsud ymm1, ymm2, ymm0 + vpdpwsud ymm2, ymm3, [eax] + vpdpwsud ymm3, ymm4, [eax+0x12] + vpdpwsud ymm4, ymm5, [eax+ebx*2] + + vpdpwsuds xmm1, xmm2, xmm0 + vpdpwsuds xmm2, xmm3, [eax] + vpdpwsuds xmm3, xmm4, [eax+0x12] + vpdpwsuds xmm4, xmm5, [eax+ebx*2] + + vpdpwsuds ymm1, ymm2, ymm0 + vpdpwsuds ymm2, ymm3, [eax] + vpdpwsuds ymm3, ymm4, [eax+0x12] + vpdpwsuds ymm4, ymm5, [eax+ebx*2] + + vpdpwusd xmm1, xmm2, xmm0 + vpdpwusd xmm2, xmm3, [eax] + vpdpwusd xmm3, xmm4, [eax+0x12] + vpdpwusd xmm4, xmm5, [eax+ebx*2] + + vpdpwusd ymm1, ymm2, ymm0 + vpdpwusd ymm2, ymm3, [eax] + vpdpwusd ymm3, ymm4, [eax+0x12] + vpdpwusd ymm4, ymm5, [eax+ebx*2] + + vpdpwusds xmm1, xmm2, xmm0 + vpdpwusds xmm2, xmm3, [eax] + vpdpwusds xmm3, xmm4, [eax+0x12] + vpdpwusds xmm4, xmm5, [eax+ebx*2] + + vpdpwusds ymm1, ymm2, ymm0 + vpdpwusds ymm2, ymm3, [eax] + vpdpwusds ymm3, ymm4, 
[eax+0x12] + vpdpwusds ymm4, ymm5, [eax+ebx*2] + + vpdpwuud xmm1, xmm2, xmm0 + vpdpwuud xmm2, xmm3, [eax] + vpdpwuud xmm3, xmm4, [eax+0x12] + vpdpwuud xmm4, xmm5, [eax+ebx*2] + + vpdpwuud ymm1, ymm2, ymm0 + vpdpwuud ymm2, ymm3, [eax] + vpdpwuud ymm3, ymm4, [eax+0x12] + vpdpwuud ymm4, ymm5, [eax+ebx*2] + + vpdpwuuds xmm1, xmm2, xmm0 + vpdpwuuds xmm2, xmm3, [eax] + vpdpwuuds xmm3, xmm4, [eax+0x12] + vpdpwuuds xmm4, xmm5, [eax+ebx*2] + + vpdpwuuds ymm1, ymm2, ymm0 + vpdpwuuds ymm2, ymm3, [eax] + vpdpwuuds ymm3, ymm4, [eax+0x12] + vpdpwuuds ymm4, ymm5, [eax+ebx*2] + + diff --git a/x86/iflags.ph b/x86/iflags.ph index f8d331244..3028f0cfe 100644 --- a/x86/iflags.ph +++ b/x86/iflags.ph @@ -118,6 +118,7 @@ if_("SM4", "SM4 instructions"); if_("AMXFP16", "AMX float16 multiplication"); if_("AMXCOMPLEX", "AMX float16 complex multiplication"); if_("AVXVNNI", "AVX Vector Neural Network instructions"); +if_("AVXVNNIINT16", "AVX Vector Neural Network 16-bit integer instructions"); # Put these last to minimize their relevance if_("OBSOLETE", "Instruction removed from architecture"); diff --git a/x86/insns.dat b/x86/insns.dat index eb18ca2ea..aed820f75 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3643,6 +3643,20 @@ VPDPBUUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 50 /r] AVXVNNIINT8 VPDPBUUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX VPDPBUUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 51 /r] AVXVNNIINT8,FUTURE,AVX +;# AVX Vector Neural Network Instructions INT16 +VPDPWSUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWSUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWSUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.f3.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWSUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.f3.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUSD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUSD 
ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUSDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUSDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.66.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUUD xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUUD ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 d2 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUUDS xmmreg,xmmreg,xmmrm128 [rvm: vex.128.np.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX +VPDPWUUDS ymmreg,ymmreg,ymmrm256 [rvm: vex.256.np.0f38.w0 d3 /r] AVXVNNIINT16,FUTURE,AVX + ;# AVX Integer Fused Multiply-Add ; Must precede AVX-512 versions VPMADD52HUQ xmmreg,xmmreg,xmmrm128 [rvm: vex.128.66.0f38.w1 b5 /r] AVXIFMA,FUTURE,LATEVEX From e8b7cd44dede3f7923bb9bed52858f3f83bb1fb2 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Wed, 17 Jul 2024 14:01:56 +0200 Subject: [PATCH 09/12] fix: LATEVEX for VCVTNEPS2BF16 -- According to Intel SDM 325462-084US, p. 1989, VCVTNEPS2BF16 has EVEX form too, so LATEVEX required -- only for VCVTNEPS2BF16 required explicit size-operator -- LATEVEX for VCVTNEPS2BF16 in avx-ne-convert.asm, avx-ne-convert-64.asm test files Checked with XED version: [v2024.04.01] --- test/avx-ne-convert-64.asm | 1 + test/avx-ne-convert.asm | 1 + x86/insns.dat | 28 ++++++++++++++-------------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/test/avx-ne-convert-64.asm b/test/avx-ne-convert-64.asm index 1d38ed0cd..9f4ce7447 100644 --- a/test/avx-ne-convert-64.asm +++ b/test/avx-ne-convert-64.asm @@ -11,6 +11,7 @@ BITS 64 vcvtneobf162ps ymm1, yword [rcx] vcvtneoph2ps xmm1, oword [rbx] vcvtneoph2ps ymm1, yword [rcx] + cpu latevex vcvtneps2bf16 xmm1, xmm2 vcvtneps2bf16 xmm1, ymm2 vcvtneps2bf16 xmm1, oword [rbx] diff --git a/test/avx-ne-convert.asm b/test/avx-ne-convert.asm index fd99f2bf4..3239aebe4 100644 --- a/test/avx-ne-convert.asm +++ b/test/avx-ne-convert.asm @@ -11,6 +11,7 @@ BITS 32 vcvtneobf162ps ymm1, 
yword [ecx] vcvtneoph2ps xmm1, oword [ebx] vcvtneoph2ps ymm1, yword [ecx] + cpu latevex vcvtneps2bf16 xmm1, xmm2 vcvtneps2bf16 xmm1, ymm2 vcvtneps2bf16 xmm1, oword [ebx] diff --git a/x86/insns.dat b/x86/insns.dat index aed820f75..3fed00c07 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -3603,20 +3603,20 @@ VSM4RNDS4 xmmreg,xmmreg,xmmrm128 [rvm: vex.nds.128.f2.0f38.w0 VSM4RNDS4 ymmreg,ymmreg,ymmrm128 [rvm: vex.nds.256.f2.0f38.w0 da /r] SM4,AVX,FUTURE ;# AVX no exception conversions -VBCSTNEBF162PS xmmreg,mem16 [rm: vex.128.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW -VBCSTNEBF162PS ymmreg,mem16 [rm: vex.256.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW -VBCSTNESH2PS xmmreg,mem16 [rm: vex.128.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW -VBCSTNESH2PS ymmreg,mem16 [rm: vex.256.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX,SW -VCVTNEEBF162PS xmmreg,mem128 [rm: vex.128.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX -VCVTNEEBF162PS ymmreg,mem256 [rm: vex.256.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY -VCVTNEEPH2PS xmmreg,mem128 [rm: vex.128.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX -VCVTNEEPH2PS ymmreg,mem256 [rm: vex.256.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY -VCVTNEOBF162PS xmmreg,mem128 [rm: vex.128.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX -VCVTNEOBF162PS ymmreg,mem256 [rm: vex.256.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY -VCVTNEOPH2PS xmmreg,mem128 [rm: vex.128.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SX -VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX,SY -VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SX -VCVTNEPS2BF16 xmmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,AVX,SY +VBCSTNEBF162PS xmmreg,mem16 [rm: vex.128.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX +VBCSTNEBF162PS ymmreg,mem16 [rm: vex.256.f3.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX +VBCSTNESH2PS xmmreg,mem16 [rm: vex.128.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX +VBCSTNESH2PS ymmreg,mem16 [rm: 
vex.256.66.0f38.w0 b1 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEEBF162PS xmmreg,mem128 [rm: vex.128.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEEBF162PS ymmreg,mem256 [rm: vex.256.f3.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEEPH2PS xmmreg,mem128 [rm: vex.128.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEEPH2PS ymmreg,mem256 [rm: vex.256.66.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEOBF162PS xmmreg,mem128 [rm: vex.128.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEOBF162PS ymmreg,mem256 [rm: vex.256.f2.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEOPH2PS xmmreg,mem128 [rm: vex.128.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEOPH2PS ymmreg,mem256 [rm: vex.256.np.0f38.w0 b0 /r] AVXNECONVERT,FUTURE,AVX +VCVTNEPS2BF16 xmmreg,xmmrm128 [rm: vex.128.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,LATEVEX,SX +VCVTNEPS2BF16 xmmreg,ymmrm256 [rm: vex.256.f3.0f38.w0 72 /r] AVXNECONVERT,FUTURE,LATEVEX,SY ;# AVX Vector Neural Network Instructions ; Must precede AVX-512 versions From 0b029400951fa17d134d574d5ffe2ed24e653730 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Wed, 17 Jul 2024 14:04:51 +0200 Subject: [PATCH 10/12] fix: AVX512_BF16 support -- finished according to Intel SDM 325462-084US -- avx512bf16.asm, avx512bf16-64.asm test files Checked with XED version: [v2024.04.01] --- test/avx512bf16-64.asm | 108 +++++++++++++++++++++++++++++++++++++++++ test/avx512bf16.asm | 108 +++++++++++++++++++++++++++++++++++++++++ x86/insns.dat | 18 +++---- 3 files changed, 225 insertions(+), 9 deletions(-) create mode 100644 test/avx512bf16-64.asm create mode 100644 test/avx512bf16.asm diff --git a/test/avx512bf16-64.asm b/test/avx512bf16-64.asm new file mode 100644 index 000000000..d532cd7cd --- /dev/null +++ b/test/avx512bf16-64.asm @@ -0,0 +1,108 @@ +BITS 64 + vcvtne2ps2bf16 xmm1, xmm2, xmm3 + vcvtne2ps2bf16 ymm1, ymm2, ymm3 + vcvtne2ps2bf16 zmm1, zmm2, zmm3 + + vcvtneps2bf16 xmm1, xmm2 + vcvtneps2bf16 xmm1, ymm2 + vcvtneps2bf16 ymm1, zmm2 + + vdpbf16ps xmm1, xmm2, xmm3 + 
vdpbf16ps ymm1, ymm2, ymm3 + vdpbf16ps zmm1, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1, xmm2, [rax] + vcvtne2ps2bf16 ymm1, ymm2, [rcx+1] + vcvtne2ps2bf16 zmm1, zmm2, [2*rdx+64] + + vcvtneps2bf16 xmm1, oword [rax] + vcvtneps2bf16 xmm1, yword [rcx+1] + vcvtneps2bf16 ymm1, [2*rdx+64] + + vdpbf16ps xmm1, xmm2, [rax] + vdpbf16ps ymm1, ymm2, [rcx+1] + vdpbf16ps zmm1, zmm2, [2*rdx+64] + + vcvtne2ps2bf16 xmm1, xmm2, [rax]{1to4} + vcvtne2ps2bf16 ymm1, ymm2, [rcx+1]{1to8} + vcvtne2ps2bf16 zmm1, zmm2, [2*rdx+4]{1to16} + + vcvtneps2bf16 xmm1, [rax]{1to4} + vcvtneps2bf16 xmm1, [rcx+1]{1to8} + vcvtneps2bf16 ymm1, [2*rdx+4]{1to16} + + vdpbf16ps xmm1, xmm2, [rax]{1to4} + vdpbf16ps ymm1, ymm2, [rcx+1]{1to8} + vdpbf16ps zmm1, zmm2, [2*rdx+4]{1to16} + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, xmm3 + vcvtne2ps2bf16 ymm1 {k1}, ymm2, ymm3 + vcvtne2ps2bf16 zmm1 {k1}, zmm2, zmm3 + + vcvtneps2bf16 xmm1 {k1}, xmm2 + vcvtneps2bf16 xmm1 {k1}, ymm2 + vcvtneps2bf16 ymm1 {k1}, zmm2 + + vdpbf16ps xmm1 {k1}, xmm2, xmm3 + vdpbf16ps ymm1 {k1}, ymm2, ymm3 + vdpbf16ps zmm1 {k1}, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, [rax] + vcvtne2ps2bf16 ymm1 {k1}, ymm2, [rcx+1] + vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*rdx+64] + + vcvtneps2bf16 xmm1 {k1}, oword [rax] + vcvtneps2bf16 xmm1 {k1}, yword [rcx+1] + vcvtneps2bf16 ymm1 {k1}, [2*rdx+64] + + vdpbf16ps xmm1 {k1}, xmm2, [rax] + vdpbf16ps ymm1 {k1}, ymm2, [rcx+1] + vdpbf16ps zmm1 {k1}, zmm2, [2*rdx+64] + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, [rax]{1to4} + vcvtne2ps2bf16 ymm1 {k1}, ymm2, [rcx+1]{1to8} + vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*rdx+4]{1to16} + + vcvtneps2bf16 xmm1 {k1}, [rax]{1to4} + vcvtneps2bf16 xmm1 {k1}, [rcx+1]{1to8} + vcvtneps2bf16 ymm1 {k1}, [2*rdx+4]{1to16} + + vdpbf16ps xmm1 {k1}, xmm2, [rax]{1to4} + vdpbf16ps ymm1 {k1}, ymm2, [rcx+1]{1to8} + vdpbf16ps zmm1 {k1}, zmm2, [2*rdx+4]{1to16} + + vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, xmm3 + vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, ymm3 + vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, zmm3 + + vcvtneps2bf16 xmm1 {k1}{z}, xmm2 + 
vcvtneps2bf16 xmm1 {k1}{z}, ymm2 + vcvtneps2bf16 ymm1 {k1}{z}, zmm2 + + vdpbf16ps xmm1 {k1}{z}, xmm2, xmm3 + vdpbf16ps ymm1 {k1}{z}, ymm2, ymm3 + vdpbf16ps zmm1 {k1}{z}, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [rax] + vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [rcx+1] + vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*rdx+64] + + vcvtneps2bf16 xmm1 {k1}{z}, oword [rax] + vcvtneps2bf16 xmm1 {k1}{z}, yword [rcx+1] + vcvtneps2bf16 ymm1 {k1}{z}, [2*rax+64] + + vdpbf16ps xmm1 {k1}{z}, xmm2, [rax] + vdpbf16ps ymm1 {k1}{z}, ymm2, [rcx+1] + vdpbf16ps zmm1 {k1}{z}, zmm2, [2*rdx+64] + + vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [rax]{1to4} + vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [rcx+1]{1to8} + vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*rdx+4]{1to16} + + vcvtneps2bf16 xmm1 {k1}{z}, [rax]{1to4} + vcvtneps2bf16 xmm1 {k1}{z}, [rcx+1]{1to8} + vcvtneps2bf16 ymm1 {k1}{z}, [2*rdx+4]{1to16} + + vdpbf16ps xmm1 {k1}{z}, xmm2, [rax]{1to4} + vdpbf16ps ymm1 {k1}{z}, ymm2, [rcx+1]{1to8} + vdpbf16ps zmm1 {k1}{z}, zmm2, [2*rdx+4]{1to16} diff --git a/test/avx512bf16.asm b/test/avx512bf16.asm new file mode 100644 index 000000000..b9301a3f0 --- /dev/null +++ b/test/avx512bf16.asm @@ -0,0 +1,108 @@ +BITS 32 + vcvtne2ps2bf16 xmm1, xmm2, xmm3 + vcvtne2ps2bf16 ymm1, ymm2, ymm3 + vcvtne2ps2bf16 zmm1, zmm2, zmm3 + + vcvtneps2bf16 xmm1, xmm2 + vcvtneps2bf16 xmm1, ymm2 + vcvtneps2bf16 ymm1, zmm2 + + vdpbf16ps xmm1, xmm2, xmm3 + vdpbf16ps ymm1, ymm2, ymm3 + vdpbf16ps zmm1, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1, xmm2, [eax] + vcvtne2ps2bf16 ymm1, ymm2, [ecx+1] + vcvtne2ps2bf16 zmm1, zmm2, [2*edx+64] + + vcvtneps2bf16 xmm1, oword [eax] + vcvtneps2bf16 xmm1, yword [ecx+1] + vcvtneps2bf16 ymm1, [2*edx+64] + + vdpbf16ps xmm1, xmm2, [eax] + vdpbf16ps ymm1, ymm2, [ecx+1] + vdpbf16ps zmm1, zmm2, [2*edx+64] + + vcvtne2ps2bf16 xmm1, xmm2, [eax]{1to4} + vcvtne2ps2bf16 ymm1, ymm2, [ecx+1]{1to8} + vcvtne2ps2bf16 zmm1, zmm2, [2*edx+4]{1to16} + + vcvtneps2bf16 xmm1, [eax]{1to4} + vcvtneps2bf16 xmm1, [ecx+1]{1to8} + vcvtneps2bf16 ymm1, 
[2*edx+4]{1to16} + + vdpbf16ps xmm1, xmm2, [eax]{1to4} + vdpbf16ps ymm1, ymm2, [ecx+1]{1to8} + vdpbf16ps zmm1, zmm2, [2*edx+4]{1to16} + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, xmm3 + vcvtne2ps2bf16 ymm1 {k1}, ymm2, ymm3 + vcvtne2ps2bf16 zmm1 {k1}, zmm2, zmm3 + + vcvtneps2bf16 xmm1 {k1}, xmm2 + vcvtneps2bf16 xmm1 {k1}, ymm2 + vcvtneps2bf16 ymm1 {k1}, zmm2 + + vdpbf16ps xmm1 {k1}, xmm2, xmm3 + vdpbf16ps ymm1 {k1}, ymm2, ymm3 + vdpbf16ps zmm1 {k1}, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, [eax] + vcvtne2ps2bf16 ymm1 {k1}, ymm2, [ecx+1] + vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*edx+64] + + vcvtneps2bf16 xmm1 {k1}, oword [eax] + vcvtneps2bf16 xmm1 {k1}, yword [ecx+1] + vcvtneps2bf16 ymm1 {k1}, [2*edx+64] + + vdpbf16ps xmm1 {k1}, xmm2, [eax] + vdpbf16ps ymm1 {k1}, ymm2, [ecx+1] + vdpbf16ps zmm1 {k1}, zmm2, [2*edx+64] + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, [eax]{1to4} + vcvtne2ps2bf16 ymm1 {k1}, ymm2, [ecx+1]{1to8} + vcvtne2ps2bf16 zmm1 {k1}, zmm2, [2*edx+4]{1to16} + + vcvtneps2bf16 xmm1 {k1}, [eax]{1to4} + vcvtneps2bf16 xmm1 {k1}, [ecx+1]{1to8} + vcvtneps2bf16 ymm1 {k1}, [2*edx+4]{1to16} + + vdpbf16ps xmm1 {k1}, xmm2, [eax]{1to4} + vdpbf16ps ymm1 {k1}, ymm2, [ecx+1]{1to8} + vdpbf16ps zmm1 {k1}, zmm2, [2*edx+4]{1to16} + + vcvtne2ps2bf16 xmm1 {k1}, xmm2, xmm3 + vcvtne2ps2bf16 ymm1 {k1}, ymm2, ymm3 + vcvtne2ps2bf16 zmm1 {k1}, zmm2, zmm3 + + vcvtneps2bf16 xmm1 {k1}, xmm2 + vcvtneps2bf16 xmm1 {k1}, ymm2 + vcvtneps2bf16 ymm1 {k1}, zmm2 + + vdpbf16ps xmm1 {k1}{z}, xmm2, xmm3 + vdpbf16ps ymm1 {k1}{z}, ymm2, ymm3 + vdpbf16ps zmm1 {k1}{z}, zmm2, zmm3 + + vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [eax] + vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [ecx+1] + vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*edx+64] + + vcvtneps2bf16 xmm1 {k1}{z}, oword [eax] + vcvtneps2bf16 xmm1 {k1}{z}, yword [ecx+1] + vcvtneps2bf16 ymm1 {k1}{z}, [2*edx+64] + + vdpbf16ps xmm1 {k1}{z}, xmm2, [eax] + vdpbf16ps ymm1 {k1}{z}, ymm2, [ecx+1] + vdpbf16ps zmm1 {k1}{z}, zmm2, [2*edx+64] + + vcvtne2ps2bf16 xmm1 {k1}{z}, xmm2, [eax]{1to4} + 
vcvtne2ps2bf16 ymm1 {k1}{z}, ymm2, [ecx+1]{1to8} + vcvtne2ps2bf16 zmm1 {k1}{z}, zmm2, [2*edx+4]{1to16} + + vcvtneps2bf16 xmm1 {k1}{z}, [eax]{1to4} + vcvtneps2bf16 xmm1 {k1}{z}, [ecx+1]{1to8} + vcvtneps2bf16 ymm1 {k1}{z}, [2*edx+4]{1to16} + + vdpbf16ps xmm1 {k1}{z}, xmm2, [eax]{1to4} + vdpbf16ps ymm1 {k1}{z}, ymm2, [ecx+1]{1to8} + vdpbf16ps zmm1 {k1}{z}, zmm2, [2*edx+4]{1to16} diff --git a/x86/insns.dat b/x86/insns.dat index 3fed00c07..4b7fa63d5 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -6099,15 +6099,15 @@ XRESLDTRK void [ f2 0f 01 e9] TSXLDTRK,FUTURE XSUSLDTRK void [ f2 0f 01 e8] TSXLDTRK,FUTURE ;# AVX512 Bfloat16 instructions -VCVTNE2PS2BF16 xmmreg|mask|z,xmmreg*,xmmrm128|b32 [rvm:fv: evex.128.f2.0f38.w0 72 /r] AVX512BF16,FUTURE -VCVTNE2PS2BF16 ymmreg|mask|z,ymmreg*,ymmrm256|b32 [rvm:fv: evex.256.f2.0f38.w0 72 /r] AVX512BF16,FUTURE -VCVTNE2PS2BF16 zmmreg|mask|z,zmmreg*,zmmrm512|b32 [rvm:fv: evex.512.f2.0f38.w0 72 /r] AVX512BF16,FUTURE -VCVTNEPS2BF16 xmmreg|mask|z,xmmreg*,xmmrm128|b32 [rvm:fv: evex.128.f3.0f38.w0 72 /r] AVX512BF16,FUTURE -VCVTNEPS2BF16 ymmreg|mask|z,ymmreg*,ymmrm256|b32 [rvm:fv: evex.256.f3.0f38.w0 72 /r] AVX512BF16,FUTURE -VCVTNEPS2BF16 zmmreg|mask|z,zmmreg*,zmmrm512|b32 [rvm:fv: evex.512.f3.0f38.w0 72 /r] AVX512BF16,FUTURE -VDPBF16PS xmmreg|mask|z,xmmreg*,xmmrm128|b32 [rvm:fv: evex.128.f3.0f38.w0 52 /r] AVX512BF16,FUTURE -VDPBF16PS ymmreg|mask|z,ymmreg*,ymmrm128|b32 [rvm:fv: evex.256.f3.0f38.w0 52 /r] AVX512BF16,FUTURE -VDPBF16PS zmmreg|mask|z,zmmreg*,zmmrm128|b32 [rvm:fv: evex.512.f3.0f38.w0 52 /r] AVX512BF16,FUTURE +VCVTNE2PS2BF16 xmmreg|mask|z,xmmreg*,xmmrm128|b32 [rvm:fv: evex.128.f2.0f38.w0 72 /r] AVX512VL,AVX512BF16,FUTURE +VCVTNE2PS2BF16 ymmreg|mask|z,ymmreg*,ymmrm256|b32 [rvm:fv: evex.256.f2.0f38.w0 72 /r] AVX512VL,AVX512BF16,FUTURE +VCVTNE2PS2BF16 zmmreg|mask|z,zmmreg*,zmmrm512|b32 [rvm:fv: evex.512.f2.0f38.w0 72 /r] AVX512,AVX512BF16,FUTURE +VCVTNEPS2BF16 xmmreg|mask|z,xmmrm128|b32 [rm:fv: evex.128.f3.0f38.w0 72 /r] 
AVX512VL,AVX512BF16,FUTURE +VCVTNEPS2BF16 xmmreg|mask|z,ymmrm256|b32 [rm:fv: evex.256.f3.0f38.w0 72 /r] AVX512VL,AVX512BF16,FUTURE +VCVTNEPS2BF16 ymmreg|mask|z,zmmrm512|b32 [rm:fv: evex.512.f3.0f38.w0 72 /r] AVX512,AVX512BF16,FUTURE +VDPBF16PS xmmreg|mask|z,xmmreg*,xmmrm128|b32 [rvm:fv: evex.128.f3.0f38.w0 52 /r] AVX512VL,AVX512BF16,FUTURE +VDPBF16PS ymmreg|mask|z,ymmreg*,ymmrm256|b32 [rvm:fv: evex.256.f3.0f38.w0 52 /r] AVX512VL,AVX512BF16,FUTURE +VDPBF16PS zmmreg|mask|z,zmmreg*,zmmrm512|b32 [rvm:fv: evex.512.f3.0f38.w0 52 /r] AVX512,AVX512BF16,FUTURE ;# AVX512 mask intersect instructions VP2INTERSECTD kreg|rs2,xmmreg,xmmrm128|b32 [rvm:fv: evex.nds.128.f2.0f38.w0 68 /r] AVX512BF16,FUTURE From 01d28e4edb2c31e60e4d7619090deb95e3778e5e Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Wed, 17 Jul 2024 14:07:40 +0200 Subject: [PATCH 11/12] fix: AVX512_VP2INTERSECT support -- finished according to Intel SDM 325462-084US -- avx512vp2intersect.asm, avx512vp2intersect-64.asm test files Checked with XED version: [v2024.04.01] --- test/avx512vp2intersect-64.asm | 48 ++++++++++++++++++++++++++++++++++ test/avx512vp2intersect.asm | 48 ++++++++++++++++++++++++++++++++++ x86/insns.dat | 9 ++++--- 3 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 test/avx512vp2intersect-64.asm create mode 100644 test/avx512vp2intersect.asm diff --git a/test/avx512vp2intersect-64.asm b/test/avx512vp2intersect-64.asm new file mode 100644 index 000000000..288a5179e --- /dev/null +++ b/test/avx512vp2intersect-64.asm @@ -0,0 +1,48 @@ +BITS 64 + vp2intersectd k0, xmm1, xmm2 + vp2intersectd k0, ymm1, ymm2 + vp2intersectd k0, zmm1, zmm2 + + vp2intersectq k0, xmm1, xmm2 + vp2intersectq k0, ymm1, ymm2 + vp2intersectq k0, zmm1, zmm2 + + vp2intersectd k1, xmm1, xmm2 + vp2intersectd k1, ymm1, ymm2 + vp2intersectd k1, zmm1, zmm2 + + vp2intersectq k1, xmm1, xmm2 + vp2intersectq k1, ymm1, ymm2 + vp2intersectq k1, zmm1, zmm2 + + vp2intersectd k0, xmm1, [rax] + vp2intersectd k0, ymm1, [rcx+1] + 
vp2intersectd k0, zmm1, [2*rdx+64] + + vp2intersectq k0, xmm1, [rax] + vp2intersectq k0, ymm1, [rcx+1] + vp2intersectq k0, zmm1, [2*rdx+64] + + vp2intersectd k1, xmm1, [rax] + vp2intersectd k1, ymm1, [rcx+1] + vp2intersectd k1, zmm1, [2*rdx+64] + + vp2intersectq k1, xmm1, [rax] + vp2intersectq k1, ymm1, [rcx+1] + vp2intersectq k1, zmm1, [2*rdx+64] + + vp2intersectd k0, xmm1, [rax]{1to4} + vp2intersectd k0, ymm1, [rcx+1]{1to8} + vp2intersectd k0, zmm1, [2*rdx+4]{1to16} + + vp2intersectq k0, xmm1, [rax]{1to2} + vp2intersectq k0, ymm1, [rcx+1]{1to4} + vp2intersectq k0, zmm1, [2*rdx+8]{1to8} + + vp2intersectd k1, xmm1, [rax]{1to4} + vp2intersectd k1, ymm1, [rcx+1]{1to8} + vp2intersectd k1, zmm1, [2*rdx+4]{1to16} + + vp2intersectq k1, xmm1, [rax]{1to2} + vp2intersectq k1, ymm1, [rcx+1]{1to4} + vp2intersectq k1, zmm1, [2*rdx+8]{1to8} diff --git a/test/avx512vp2intersect.asm b/test/avx512vp2intersect.asm new file mode 100644 index 000000000..56ce4b66a --- /dev/null +++ b/test/avx512vp2intersect.asm @@ -0,0 +1,48 @@ +BITS 32 + vp2intersectd k0, xmm1, xmm2 + vp2intersectd k0, ymm1, ymm2 + vp2intersectd k0, zmm1, zmm2 + + vp2intersectq k0, xmm1, xmm2 + vp2intersectq k0, ymm1, ymm2 + vp2intersectq k0, zmm1, zmm2 + + vp2intersectd k1, xmm1, xmm2 + vp2intersectd k1, ymm1, ymm2 + vp2intersectd k1, zmm1, zmm2 + + vp2intersectq k1, xmm1, xmm2 + vp2intersectq k1, ymm1, ymm2 + vp2intersectq k1, zmm1, zmm2 + + vp2intersectd k0, xmm1, [eax] + vp2intersectd k0, ymm1, [ecx+1] + vp2intersectd k0, zmm1, [2*edx+64] + + vp2intersectq k0, xmm1, [eax] + vp2intersectq k0, ymm1, [ecx+1] + vp2intersectq k0, zmm1, [2*edx+64] + + vp2intersectd k1, xmm1, [eax] + vp2intersectd k1, ymm1, [ecx+1] + vp2intersectd k1, zmm1, [2*edx+64] + + vp2intersectq k1, xmm1, [eax] + vp2intersectq k1, ymm1, [ecx+1] + vp2intersectq k1, zmm1, [2*edx+64] + + vp2intersectd k0, xmm1, [eax]{1to4} + vp2intersectd k0, ymm1, [ecx+1]{1to8} + vp2intersectd k0, zmm1, [2*edx+4]{1to16} + + vp2intersectq k0, xmm1, [eax]{1to2} + 
vp2intersectq k0, ymm1, [ecx+1]{1to4} + vp2intersectq k0, zmm1, [2*edx+8]{1to8} + + vp2intersectd k1, xmm1, [eax]{1to4} + vp2intersectd k1, ymm1, [ecx+1]{1to8} + vp2intersectd k1, zmm1, [2*edx+4]{1to16} + + vp2intersectq k1, xmm1, [eax]{1to2} + vp2intersectq k1, ymm1, [ecx+1]{1to4} + vp2intersectq k1, zmm1, [2*edx+8]{1to8} diff --git a/x86/insns.dat b/x86/insns.dat index 4b7fa63d5..49b5264dd 100644 --- a/x86/insns.dat +++ b/x86/insns.dat @@ -6110,9 +6110,12 @@ VDPBF16PS ymmreg|mask|z,ymmreg*,ymmrm256|b32 [rvm:fv: evex.256.f3.0f38.w0 52 /r] VDPBF16PS zmmreg|mask|z,zmmreg*,zmmrm512|b32 [rvm:fv: evex.512.f3.0f38.w0 52 /r] AVX512,AVX512BF16,FUTURE ;# AVX512 mask intersect instructions -VP2INTERSECTD kreg|rs2,xmmreg,xmmrm128|b32 [rvm:fv: evex.nds.128.f2.0f38.w0 68 /r] AVX512BF16,FUTURE -VP2INTERSECTD kreg|rs2,ymmreg,ymmrm128|b32 [rvm:fv: evex.nds.256.f2.0f38.w0 68 /r] AVX512BF16,FUTURE -VP2INTERSECTD kreg|rs2,zmmreg,zmmrm128|b32 [rvm:fv: evex.nds.512.f2.0f38.w0 68 /r] AVX512BF16,FUTURE +VP2INTERSECTD kreg|rs2,xmmreg,xmmrm128|b32 [rvm:fv: evex.nds.128.f2.0f38.w0 68 /r] AVX512VL,AVX512VP2INTERSECT,FUTURE +VP2INTERSECTD kreg|rs2,ymmreg,ymmrm256|b32 [rvm:fv: evex.nds.256.f2.0f38.w0 68 /r] AVX512VL,AVX512VP2INTERSECT,FUTURE +VP2INTERSECTD kreg|rs2,zmmreg,zmmrm512|b32 [rvm:fv: evex.nds.512.f2.0f38.w0 68 /r] AVX512,AVX512VP2INTERSECT,FUTURE +VP2INTERSECTQ kreg|rs2,xmmreg,xmmrm128|b64 [rvm:fv: evex.nds.128.f2.0f38.w1 68 /r] AVX512VL,AVX512VP2INTERSECT,FUTURE +VP2INTERSECTQ kreg|rs2,ymmreg,ymmrm256|b64 [rvm:fv: evex.nds.256.f2.0f38.w1 68 /r] AVX512VL,AVX512VP2INTERSECT,FUTURE +VP2INTERSECTQ kreg|rs2,zmmreg,zmmrm512|b64 [rvm:fv: evex.nds.512.f2.0f38.w1 68 /r] AVX512,AVX512VP2INTERSECT,FUTURE ;# Intel Advanced Matrix Extensions (AMX) LDTILECFG mem512 [m: vex.128.np.0f38.w0 49 /0] AMXTILE,FUTURE,SZ,LONG From e8355b0ab699cfa9e9a49e3f63a03b63003631b9 Mon Sep 17 00:00:00 2001 From: InstLatx64 Date: Fri, 2 Aug 2024 19:22:29 +0200 Subject: [PATCH 12/12] fix: AVX512_FP16 FMA mnemonics -- 
VF[N]M[ADD|SUB][132|213|231][S|P]H
---
 x86/insns.dat | 96 +++++++++++++++++++++++++--------------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/x86/insns.dat b/x86/insns.dat
index 49b5264dd..0e3f66624 100644
--- a/x86/insns.dat
+++ b/x86/insns.dat
@@ -6277,54 +6277,54 @@ VFMSUBADD213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.ma
 VFMSUBADD231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 b7 /r] AVX512FP16,AVX512VL,FUTURE
 VFMSUBADD231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 b7 /r] AVX512FP16,AVX512VL,FUTURE
 VFMSUBADD231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 b7 /r] AVX512FP16,FUTURE
-VPMADD132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 98 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 98 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 98 /r] AVX512FP16,FUTURE
-VPMADD213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 a8 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 a8 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 a8 /r] AVX512FP16,FUTURE
-VPMADD231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 b8 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 b8 /r] AVX512FP16,AVX512VL,FUTURE
-VPMADD231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 b8 /r] AVX512FP16,FUTURE
-VFMADD132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9c /r] AVX512FP16,AVX512VL,FUTURE
-VFMADD132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9c /r] AVX512FP16,AVX512VL,FUTURE
-VFMADD132PH 
zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9c /r] AVX512FP16,FUTURE -VFMADD213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ac /r] AVX512FP16,AVX512VL,FUTURE -VFMADD213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ac /r] AVX512FP16,AVX512VL,FUTURE -VFMADD213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ac /r] AVX512FP16,FUTURE -VFMADD231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 bc /r] AVX512FP16,AVX512VL,FUTURE -VFMADD231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 bc /r] AVX512FP16,AVX512VL,FUTURE -VFMADD231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 bc /r] AVX512FP16,FUTURE -VPMADD132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 99 /r] AVX512FP16,FUTURE -VPMADD213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 a9 /r] AVX512FP16,FUTURE -VPMADD231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 b9 /r] AVX512FP16,FUTURE -VPNMADD132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9d /r] AVX512FP16,FUTURE -VPNMADD213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 ad /r] AVX512FP16,FUTURE -VPNMADD231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bd /r] AVX512FP16,FUTURE -VPMSUB132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9a /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9a /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9a /r] AVX512FP16,FUTURE -VPMSUB213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 aa /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 aa /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB213PH 
zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 aa /r] AVX512FP16,FUTURE -VPMSUB231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ba /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ba /r] AVX512FP16,AVX512VL,FUTURE -VPMSUB231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ba /r] AVX512FP16,FUTURE -VFMSUB132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9e /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9e /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9e /r] AVX512FP16,FUTURE -VFMSUB213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ae /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ae /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ae /r] AVX512FP16,FUTURE -VFMSUB231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 be /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 be /r] AVX512FP16,AVX512VL,FUTURE -VFMSUB231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 be /r] AVX512FP16,FUTURE -VPMSUB132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9b /r] AVX512FP16,FUTURE -VPMSUB213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 ab /r] AVX512FP16,FUTURE -VPMSUB231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bb /r] AVX512FP16,FUTURE -VPNMSUB132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9f /r] AVX512FP16,FUTURE -VPNMSUB213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 af /r] AVX512FP16,FUTURE -VPNMSUB231SH 
xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bf /r] AVX512FP16,FUTURE +VFMADD132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 98 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 98 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 98 /r] AVX512FP16,FUTURE +VFMADD213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 a8 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 a8 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 a8 /r] AVX512FP16,FUTURE +VFMADD231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 b8 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 b8 /r] AVX512FP16,AVX512VL,FUTURE +VFMADD231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 b8 /r] AVX512FP16,FUTURE +VFNMADD132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9c /r] AVX512FP16,AVX512VL,FUTURE +VFNMADD132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9c /r] AVX512FP16,AVX512VL,FUTURE +VFNMADD132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9c /r] AVX512FP16,FUTURE +VFNMADD213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ac /r] AVX512FP16,AVX512VL,FUTURE +VFNMADD213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ac /r] AVX512FP16,AVX512VL,FUTURE +VFNMADD213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ac /r] AVX512FP16,FUTURE +VFNMADD231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 bc /r] AVX512FP16,AVX512VL,FUTURE +VFNMADD231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 bc /r] 
AVX512FP16,AVX512VL,FUTURE +VFNMADD231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 bc /r] AVX512FP16,FUTURE +VFMADD132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 99 /r] AVX512FP16,FUTURE +VFMADD213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 a9 /r] AVX512FP16,FUTURE +VFMADD231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 b9 /r] AVX512FP16,FUTURE +VFNMADD132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9d /r] AVX512FP16,FUTURE +VFNMADD213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 ad /r] AVX512FP16,FUTURE +VFNMADD231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bd /r] AVX512FP16,FUTURE +VFMSUB132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9a /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9a /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9a /r] AVX512FP16,FUTURE +VFMSUB213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 aa /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 aa /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 aa /r] AVX512FP16,FUTURE +VFMSUB231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ba /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ba /r] AVX512FP16,AVX512VL,FUTURE +VFMSUB231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ba /r] AVX512FP16,FUTURE +VFNMSUB132PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 9e /r] AVX512FP16,AVX512VL,FUTURE +VFNMSUB132PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 9e /r] 
AVX512FP16,AVX512VL,FUTURE +VFNMSUB132PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 9e /r] AVX512FP16,FUTURE +VFNMSUB213PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 ae /r] AVX512FP16,AVX512VL,FUTURE +VFNMSUB213PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 ae /r] AVX512FP16,AVX512VL,FUTURE +VFNMSUB213PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 ae /r] AVX512FP16,FUTURE +VFNMSUB231PH xmmreg|mask|z,xmmreg*,xmmrm128|b16 [rvm:fv: evex.nds.128.66.map6.w0 be /r] AVX512FP16,AVX512VL,FUTURE +VFNMSUB231PH ymmreg|mask|z,ymmreg*,ymmrm256|b16 [rvm:fv: evex.nds.256.66.map6.w0 be /r] AVX512FP16,AVX512VL,FUTURE +VFNMSUB231PH zmmreg|mask|z,zmmreg*,zmmrm512|b16|er [rvm:fv: evex.nds.512.66.map6.w0 be /r] AVX512FP16,FUTURE +VFMSUB132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9b /r] AVX512FP16,FUTURE +VFMSUB213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 ab /r] AVX512FP16,FUTURE +VFMSUB231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bb /r] AVX512FP16,FUTURE +VFNMSUB132SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 9f /r] AVX512FP16,FUTURE +VFNMSUB213SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 af /r] AVX512FP16,FUTURE +VFNMSUB231SH xmmreg|mask|z,xmmreg*,xmmrm16|er [rvm:t1s: evex.nds.lig.66.map6.w0 bf /r] AVX512FP16,FUTURE VFPCLASSPH kreg|mask,xmmrm128|b16,imm8 [rmi:fv: evex.128.np.0f3a.w0 66 /r ib] AVX512FP16,AVX512VL,FUTURE VFPCLASSPH kreg|mask,ymmrm256|b16,imm8 [rmi:fv: evex.256.np.0f3a.w0 66 /r ib] AVX512FP16,AVX512VL,FUTURE VFPCLASSPH kreg|mask,zmmrm512|b16,imm8 [rmi:fv: evex.512.np.0f3a.w0 66 /r ib] AVX512FP16,FUTURE