Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for the missing SIMD levels from the 53rd Intel ISA Ext. Guide: AMX-FP16, AMX-COMPLEX, AVX-VNNI, AVX-VNNI-INT16 #88

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
25 changes: 17 additions & 8 deletions test/amx.asm
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
bits 64

%macro amx 1
%macro amx 3
%define treg tmm %+ %1
%define treg2 tmm %+ %2
%define treg3 tmm %+ %3

ldtilecfg [rsi]
sttilecfg [rdi]
Expand All @@ -16,11 +18,14 @@
tileloaddt1 treg, [rax,rdx]
tileloaddt1 treg, [rax,rdx*2]

tdpbf16ps treg, treg, treg
tdpbssd treg, treg, treg
tdpbusd treg, treg, treg
tdpbsud treg, treg, treg
tdpbuud treg, treg, treg
tdpbf16ps treg, treg2, treg3
tdpbssd treg, treg2, treg3
tdpbusd treg, treg2, treg3
tdpbsud treg, treg2, treg3
tdpbuud treg, treg2, treg3
tdpfp16ps treg, treg2, treg3
tcmmimfp16ps treg, treg2, treg3
tcmmrlfp16ps treg, treg2, treg3

tilestored [rax], treg
tilestored [rax,rdx], treg
Expand All @@ -30,7 +35,11 @@
%endmacro

%assign n 0
%assign m 1
%assign l 2
%rep 8
amx n
%assign n n+1
amx n, m, l
%assign n ((n+1) % 8)
%assign m ((m+1) % 8)
%assign l ((l+1) % 8)
%endrep
6 changes: 6 additions & 0 deletions test/avx-ifma-64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; AVX-IFMA assembly test, 64-bit mode.
;
; Assembles vpmadd52luq / vpmadd52huq with xmm and ymm operands and a
; memory source written in several addressing forms.
bits 64
; NOTE(review): these mnemonics also have EVEX (AVX-512 IFMA) encodings;
; `cpu latevex` is presumably what lets NASM pick the VEX form without an
; explicit {vex} prefix -- confirm against the NASM manual (CPU directive).
cpu latevex
vpmadd52luq xmm0, xmm1, [rax]		; base register only
vpmadd52luq ymm2, ymm3, [rbx]		; same form, 256-bit registers
vpmadd52huq xmm14, xmm5, [rax+rbx]	; destination above xmm7, base+index
vpmadd52huq ymm12, ymm7, [rax*2]	; destination above ymm7, scaled register with no separate base
7 changes: 7 additions & 0 deletions test/avx-ifma.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
; AVX-IFMA assembly test, 32-bit mode.
;
; Assembles vpmadd52luq / vpmadd52huq with xmm and ymm operands and a
; memory source written in several addressing forms.
bits 32
; NOTE(review): these mnemonics also have EVEX (AVX-512 IFMA) encodings;
; `cpu latevex` is presumably what lets NASM pick the VEX form without an
; explicit {vex} prefix -- confirm against the NASM manual (CPU directive).
cpu latevex
vpmadd52luq xmm0, xmm1, [eax]		; base register only
vpmadd52luq ymm2, ymm3, [ebx]		; same form, 256-bit registers
vpmadd52huq xmm4, xmm5, [eax+ebx]	; base+index
vpmadd52huq ymm6, ymm7, [eax*2]		; scaled register with no separate base

21 changes: 21 additions & 0 deletions test/avx-ne-convert-64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
; AVX-NE-CONVERT assembly test, 64-bit mode.
;
; Every mnemonic is assembled once with an xmm destination and once with a
; ymm destination.
BITS 64
; Broadcast forms: the memory operand is written without a size keyword.
vbcstnebf162ps xmm1, [rax]
vbcstnebf162ps ymm1, [rax]
vbcstnesh2ps xmm1, [rax]
vbcstnesh2ps ymm1, [rax]
; Convert forms: the memory operand carries an explicit size keyword that
; matches the destination register width (oword with xmm, yword with ymm).
vcvtneebf162ps xmm1, oword [rbx]
vcvtneebf162ps ymm1, yword [rcx]
vcvtneeph2ps xmm1, oword [rbx]
vcvtneeph2ps ymm1, yword [rcx]
vcvtneobf162ps xmm1, oword [rbx]
vcvtneobf162ps ymm1, yword [rcx]
vcvtneoph2ps xmm1, oword [rbx]
vcvtneoph2ps ymm1, yword [rcx]
; NOTE(review): `cpu latevex` is enabled only from here on; presumably
; because vcvtneps2bf16 is the one mnemonic in this file that already had an
; EVEX (AVX512-BF16) encoding, so the VEX form must be requested -- confirm
; against the NASM manual (CPU directive).
cpu latevex
; The destination is xmm1 in all four lines; only the source register or the
; oword/yword keyword distinguishes the 128-bit from the 256-bit source.
vcvtneps2bf16 xmm1, xmm2
vcvtneps2bf16 xmm1, ymm2
vcvtneps2bf16 xmm1, oword [rbx]
vcvtneps2bf16 xmm1, yword [rbx]



18 changes: 18 additions & 0 deletions test/avx-ne-convert.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
; AVX-NE-CONVERT assembly test, 32-bit mode.
;
; Every mnemonic is assembled once with an xmm destination and once with a
; ymm destination.
BITS 32
; Broadcast forms: the memory operand is written without a size keyword.
vbcstnebf162ps xmm1, [eax]
vbcstnebf162ps ymm1, [eax]
vbcstnesh2ps xmm1, [eax]
vbcstnesh2ps ymm1, [eax]
; Convert forms: the memory operand carries an explicit size keyword that
; matches the destination register width (oword with xmm, yword with ymm).
vcvtneebf162ps xmm1, oword [ebx]
vcvtneebf162ps ymm1, yword [ecx]
vcvtneeph2ps xmm1, oword [ebx]
vcvtneeph2ps ymm1, yword [ecx]
vcvtneobf162ps xmm1, oword [ebx]
vcvtneobf162ps ymm1, yword [ecx]
vcvtneoph2ps xmm1, oword [ebx]
vcvtneoph2ps ymm1, yword [ecx]
; NOTE(review): `cpu latevex` is enabled only from here on; presumably
; because vcvtneps2bf16 is the one mnemonic in this file that already had an
; EVEX (AVX512-BF16) encoding, so the VEX form must be requested -- confirm
; against the NASM manual (CPU directive).
cpu latevex
; The destination is xmm1 in all four lines; only the source register or the
; oword/yword keyword distinguishes the 128-bit from the 256-bit source.
vcvtneps2bf16 xmm1, xmm2
vcvtneps2bf16 xmm1, ymm2
vcvtneps2bf16 xmm1, oword [ebx]
vcvtneps2bf16 xmm1, yword [ebx]
41 changes: 41 additions & 0 deletions test/avx-vnni-64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
; AVX-VNNI assembly test, 64-bit mode.
;
; Each mnemonic is assembled with the same eight operand combinations:
; four with xmm registers followed by four with ymm registers.  Within a
; group the last operand is a register, [base], [base+disp] and
; [base+index*2] in that order.
BITS 64
; NOTE(review): these mnemonics also have EVEX (AVX-512 VNNI) encodings;
; `cpu latevex` is presumably what lets NASM pick the VEX form without an
; explicit {vex} prefix -- confirm against the NASM manual (CPU directive).
cpu latevex

; vnni_forms <mnemonic>
;   Emits the eight test instructions for one mnemonic.
%macro vnni_forms 1
	%1 xmm1, xmm2, xmm0
	%1 xmm2, xmm3, [rax]
	%1 xmm3, xmm4, [rax+0x12]
	%1 xmm4, xmm5, [rax+rbx*2]

	%1 ymm1, ymm2, ymm0
	%1 ymm2, ymm3, [rax]
	%1 ymm3, ymm4, [rax+0x12]
	%1 ymm4, ymm5, [rax+rbx*2]
%endmacro

	vnni_forms vpdpbusd
	vnni_forms vpdpbusds
	vnni_forms vpdpwssd
	vnni_forms vpdpwssds
62 changes: 62 additions & 0 deletions test/avx-vnni-int16-64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
; AVX-VNNI-INT16 assembly test, 64-bit mode.
;
; Each mnemonic is assembled with four xmm operand combinations followed by
; the same four with ymm registers.  Within a group the last operand is a
; register, [base], [base+disp] and [base+index*2] in that order.
;
; The third and fourth line of every group use register 14, so a register
; above 7 appears once as the middle operand and once as the destination.
; All groups follow this pattern, including the xmm form of vpdpwsud.
BITS 64
vpdpwsud xmm1, xmm2, xmm0
vpdpwsud xmm2, xmm3, [rax]
vpdpwsud xmm3, xmm14, [rax+0x12]
vpdpwsud xmm14, xmm5, [rax+rbx*2]

vpdpwsud ymm1, ymm2, ymm0
vpdpwsud ymm2, ymm3, [rax]
vpdpwsud ymm3, ymm14, [rax+0x12]
vpdpwsud ymm14, ymm5, [rax+rbx*2]

vpdpwsuds xmm1, xmm2, xmm0
vpdpwsuds xmm2, xmm3, [rax]
vpdpwsuds xmm3, xmm14, [rax+0x12]
vpdpwsuds xmm14, xmm5, [rax+rbx*2]

vpdpwsuds ymm1, ymm2, ymm0
vpdpwsuds ymm2, ymm3, [rax]
vpdpwsuds ymm3, ymm14, [rax+0x12]
vpdpwsuds ymm14, ymm5, [rax+rbx*2]

vpdpwusd xmm1, xmm2, xmm0
vpdpwusd xmm2, xmm3, [rax]
vpdpwusd xmm3, xmm14, [rax+0x12]
vpdpwusd xmm14, xmm5, [rax+rbx*2]

vpdpwusd ymm1, ymm2, ymm0
vpdpwusd ymm2, ymm3, [rax]
vpdpwusd ymm3, ymm14, [rax+0x12]
vpdpwusd ymm14, ymm5, [rax+rbx*2]

vpdpwusds xmm1, xmm2, xmm0
vpdpwusds xmm2, xmm3, [rax]
vpdpwusds xmm3, xmm14, [rax+0x12]
vpdpwusds xmm14, xmm5, [rax+rbx*2]

vpdpwusds ymm1, ymm2, ymm0
vpdpwusds ymm2, ymm3, [rax]
vpdpwusds ymm3, ymm14, [rax+0x12]
vpdpwusds ymm14, ymm5, [rax+rbx*2]

vpdpwuud xmm1, xmm2, xmm0
vpdpwuud xmm2, xmm3, [rax]
vpdpwuud xmm3, xmm14, [rax+0x12]
vpdpwuud xmm14, xmm5, [rax+rbx*2]

vpdpwuud ymm1, ymm2, ymm0
vpdpwuud ymm2, ymm3, [rax]
vpdpwuud ymm3, ymm14, [rax+0x12]
vpdpwuud ymm14, ymm5, [rax+rbx*2]

vpdpwuuds xmm1, xmm2, xmm0
vpdpwuuds xmm2, xmm3, [rax]
vpdpwuuds xmm3, xmm14, [rax+0x12]
vpdpwuuds xmm14, xmm5, [rax+rbx*2]

vpdpwuuds ymm1, ymm2, ymm0
vpdpwuuds ymm2, ymm3, [rax]
vpdpwuuds ymm3, ymm14, [rax+0x12]
vpdpwuuds ymm14, ymm5, [rax+rbx*2]


62 changes: 62 additions & 0 deletions test/avx-vnni-int16.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
; AVX-VNNI-INT16 assembly test, 32-bit mode.
;
; Each mnemonic is assembled with the same eight operand combinations:
; four with xmm registers followed by four with ymm registers.  Within a
; group the last operand is a register, [base], [base+disp] and
; [base+index*2] in that order.
BITS 32

; vnni_int16_forms <mnemonic>
;   Emits the eight test instructions for one mnemonic.
%macro vnni_int16_forms 1
	%1 xmm1, xmm2, xmm0
	%1 xmm2, xmm3, [eax]
	%1 xmm3, xmm4, [eax+0x12]
	%1 xmm4, xmm5, [eax+ebx*2]

	%1 ymm1, ymm2, ymm0
	%1 ymm2, ymm3, [eax]
	%1 ymm3, ymm4, [eax+0x12]
	%1 ymm4, ymm5, [eax+ebx*2]
%endmacro

	vnni_int16_forms vpdpwsud
	vnni_int16_forms vpdpwsuds
	vnni_int16_forms vpdpwusd
	vnni_int16_forms vpdpwusds
	vnni_int16_forms vpdpwuud
	vnni_int16_forms vpdpwuuds


62 changes: 62 additions & 0 deletions test/avx-vnni-int8-64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
; AVX-VNNI-INT8 assembly test, 64-bit mode.
;
; Each mnemonic is assembled with four xmm operand combinations followed by
; the same four with ymm registers.  Within a group the last operand is a
; register, [base], [base+disp] and [base+index*2] in that order.
;
; The third and fourth line of every group use register 14, so a register
; above 7 appears once as the middle operand and once as the destination.
BITS 64
vpdpbsud xmm1, xmm2, xmm0
vpdpbsud xmm2, xmm3, [rax]
; Explicit `oword` size keyword; every other xmm memory operand in this
; file leaves the size implicit.
vpdpbsud xmm3, xmm14, oword [rax+0x12]
vpdpbsud xmm14, xmm5, [rax+rbx*2]

vpdpbsud ymm1, ymm2, ymm0
vpdpbsud ymm2, ymm3, [rax]
; Explicit `yword` size keyword; every other ymm memory operand in this
; file leaves the size implicit.
vpdpbsud ymm3, ymm14, yword [rax+0x12]
vpdpbsud ymm14, ymm5, [rax+rbx*2]

vpdpbsuds xmm1, xmm2, xmm0
vpdpbsuds xmm2, xmm3, [rax]
vpdpbsuds xmm3, xmm14, [rax+0x12]
vpdpbsuds xmm14, xmm5, [rax+rbx*2]

vpdpbsuds ymm1, ymm2, ymm0
vpdpbsuds ymm2, ymm3, [rax]
vpdpbsuds ymm3, ymm14, [rax+0x12]
vpdpbsuds ymm14, ymm5, [rax+rbx*2]

vpdpbssd xmm1, xmm2, xmm0
vpdpbssd xmm2, xmm3, [rax]
vpdpbssd xmm3, xmm14, [rax+0x12]
vpdpbssd xmm14, xmm5, [rax+rbx*2]

vpdpbssd ymm1, ymm2, ymm0
vpdpbssd ymm2, ymm3, [rax]
vpdpbssd ymm3, ymm14, [rax+0x12]
vpdpbssd ymm14, ymm5, [rax+rbx*2]

vpdpbssds xmm1, xmm2, xmm0
vpdpbssds xmm2, xmm3, [rax]
vpdpbssds xmm3, xmm14, [rax+0x12]
vpdpbssds xmm14, xmm5, [rax+rbx*2]

vpdpbssds ymm1, ymm2, ymm0
vpdpbssds ymm2, ymm3, [rax]
vpdpbssds ymm3, ymm14, [rax+0x12]
vpdpbssds ymm14, ymm5, [rax+rbx*2]

vpdpbuud xmm1, xmm2, xmm0
vpdpbuud xmm2, xmm3, [rax]
vpdpbuud xmm3, xmm14, [rax+0x12]
vpdpbuud xmm14, xmm5, [rax+rbx*2]

vpdpbuud ymm1, ymm2, ymm0
vpdpbuud ymm2, ymm3, [rax]
vpdpbuud ymm3, ymm14, [rax+0x12]
vpdpbuud ymm14, ymm5, [rax+rbx*2]

vpdpbuuds xmm1, xmm2, xmm0
vpdpbuuds xmm2, xmm3, [rax]
vpdpbuuds xmm3, xmm14, [rax+0x12]
vpdpbuuds xmm14, xmm5, [rax+rbx*2]

vpdpbuuds ymm1, ymm2, ymm0
vpdpbuuds ymm2, ymm3, [rax]
vpdpbuuds ymm3, ymm14, [rax+0x12]
vpdpbuuds ymm14, ymm5, [rax+rbx*2]


62 changes: 62 additions & 0 deletions test/avx-vnni-int8.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
; AVX-VNNI-INT8 assembly test, 32-bit mode.
;
; Each mnemonic is assembled with the same eight operand combinations:
; four with xmm registers followed by four with ymm registers.  Within a
; group the last operand is a register, [base], [base+disp] and
; [base+index*2] in that order.
BITS 32

; vnni_int8_forms <mnemonic>
;   Emits the eight test instructions for one mnemonic.
%macro vnni_int8_forms 1
	%1 xmm1, xmm2, xmm0
	%1 xmm2, xmm3, [eax]
	%1 xmm3, xmm4, [eax+0x12]
	%1 xmm4, xmm5, [eax+ebx*2]

	%1 ymm1, ymm2, ymm0
	%1 ymm2, ymm3, [eax]
	%1 ymm3, ymm4, [eax+0x12]
	%1 ymm4, ymm5, [eax+ebx*2]
%endmacro

	vnni_int8_forms vpdpbsud
	vnni_int8_forms vpdpbsuds
	vnni_int8_forms vpdpbssd
	vnni_int8_forms vpdpbssds
	vnni_int8_forms vpdpbuud
	vnni_int8_forms vpdpbuuds


Loading