Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

have old binutils support on avx512 as well and improve it #2476

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 61 additions & 53 deletions crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
Original file line number Diff line number Diff line change
Expand Up @@ -956,61 +956,69 @@ sub filter_and_print {
# vaesenc/vaesenclast/vpclmulqdq instructions that use XMM registers are NOT
# using the VAES/VPCLMULQDQ features and do not require this workaround.
my %asmMap = (
'vaesenc %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdc,0xe2',
'vaesenc %ymm2, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdc,0xea',
'vaesenc %ymm2, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdc,0xf2',
'vaesenc %ymm2, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdc,0xfa',
'vaesenclast %ymm10, %ymm12, %ymm12' => '.byte 0xc4,0x42,0x1d,0xdd,0xe2',
'vaesenclast %ymm10, %ymm13, %ymm13' => '.byte 0xc4,0x42,0x15,0xdd,0xea',
'vaesenclast %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdd,0xe2',
'vaesenclast %ymm3, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdd,0xeb',
'vaesenclast %ymm5, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdd,0xf5',
'vaesenclast %ymm6, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdd,0xfe',
'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00',
'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm5' => '.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00',
'vpclmulqdq $0x00, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00',
'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00',
'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm5' => '.byte 0xc4,0xe3,0x65,0x44,0xec,0x00',
'vpclmulqdq $0x00, %ymm5, %ymm3, %ymm0' => '.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00',
'vpclmulqdq $0x00, %ymm5, %ymm4, %ymm0' => '.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00',
'vpclmulqdq $0x00, %ymm7, %ymm2, %ymm6' => '.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00',
'vpclmulqdq $0x00, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00',
'vpclmulqdq $0x01, %ymm0, %ymm6, %ymm2' => '.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01',
'vpclmulqdq $0x01, %ymm1, %ymm6, %ymm0' => '.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01',
'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01',
'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm6' => '.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01',
'vpclmulqdq $0x01, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm1' => '.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01',
'vpclmulqdq $0x01, %ymm6, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01',
'vpclmulqdq $0x01, %ymm6, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01',
'vpclmulqdq $0x10, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10',
'vpclmulqdq $0x10, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10',
'vpclmulqdq $0x10, %ymm5, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10',
'vpclmulqdq $0x10, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10',
'vpclmulqdq $0x10, %ymm7, %ymm2, %ymm2' => '.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10',
'vpclmulqdq $0x10, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10',
'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11',
'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm7' => '.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11',
'vpclmulqdq $0x11, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11',
'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11',
'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11',
'vpclmulqdq $0x11, %ymm5, %ymm3, %ymm4' => '.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11',
'vpclmulqdq $0x11, %ymm5, %ymm4, %ymm3' => '.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11',
'vaesenc %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdc,0xe2',
'vaesenc %ymm2, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdc,0xea',
'vaesenc %ymm2, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdc,0xf2',
'vaesenc %ymm2, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdc,0xfa',
'vaesenclast %ymm10, %ymm12, %ymm12' => '.byte 0xc4,0x42,0x1d,0xdd,0xe2',
'vaesenclast %ymm10, %ymm13, %ymm13' => '.byte 0xc4,0x42,0x15,0xdd,0xea',
'vaesenclast %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdd,0xe2',
'vaesenclast %ymm3, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdd,0xeb',
'vaesenclast %ymm5, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdd,0xf5',
'vaesenclast %ymm6, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdd,0xfe',
'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00',
'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm5' => '.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00',
'vpclmulqdq $0x00, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00',
'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00',
'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm5' => '.byte 0xc4,0xe3,0x65,0x44,0xec,0x00',
'vpclmulqdq $0x00, %ymm5, %ymm3, %ymm0' => '.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00',
'vpclmulqdq $0x00, %ymm5, %ymm4, %ymm0' => '.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00',
'vpclmulqdq $0x00, %ymm7, %ymm2, %ymm6' => '.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00',
'vpclmulqdq $0x00, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00',
'vpclmulqdq $0x01, %ymm0, %ymm6, %ymm2' => '.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01',
'vpclmulqdq $0x01, %ymm1, %ymm6, %ymm0' => '.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01',
'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01',
'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm6' => '.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01',
'vpclmulqdq $0x01, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm1' => '.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01',
'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01',
'vpclmulqdq $0x01, %ymm6, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01',
'vpclmulqdq $0x01, %ymm6, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01',
'vpclmulqdq $0x10, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10',
'vpclmulqdq $0x10, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10',
'vpclmulqdq $0x10, %ymm5, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10',
'vpclmulqdq $0x10, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10',
'vpclmulqdq $0x10, %ymm7, %ymm2, %ymm2' => '.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10',
'vpclmulqdq $0x10, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10',
'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11',
'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm7' => '.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11',
'vpclmulqdq $0x11, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11',
'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11',
'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11',
'vpclmulqdq $0x11, %ymm5, %ymm3, %ymm4' => '.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11',
'vpclmulqdq $0x11, %ymm5, %ymm4, %ymm3' => '.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11',
);
for my $line (split("\n",$code)) {
my $trimmed;
$trimmed = $line;
$trimmed =~ s/^\s+//;
$trimmed =~ s/\s+(#.*)?$//;
if (exists $asmMap{$trimmed}) {
$line = $asmMap{$trimmed};
} else {
if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
'find target -name "*aes-gcm-avx2*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
if ( $line =~ /^
(?<prespace>\s*)
(?<instruction>[a-z0-9]+)
\s+
(?<args>([^#]*[^ #])?)
(?<postspace>\s*([#].*)?)
$/x
) {
my $trimmed = $+{instruction} . " " . $+{args};
my $prespace = $+{prespace};
my $postspace = $+{postspace};
if (exists $asmMap{$trimmed}) {
$line = ${prespace} . $asmMap{$trimmed} . ${postspace};
} else {
if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
'find target -name "*aes-gcm-avx2*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
}
}
}
print $line,"\n";
Expand Down
38 changes: 37 additions & 1 deletion crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,42 @@ sub _aes_gcm_update {
$code .= _aes_gcm_update 0;
$code .= _end_func;

print $code;

sub filter_and_print {
# This function replaces VAES and VPCLMULQDQ assembly instructions with their
# assembled forms, to allow the code to work on versions of binutils older than
# 2.30 that do not support these instructions.
# vaesenc/vaesenclast/vpclmulqdq instructions that use XMM registers are NOT
# using the VAES/VPCLMULQDQ features and do not require this workaround.
my %asmMap = (
# Currently empty, will contain content once we get AVX-512 support
);
for my $line (split("\n",$code)) {
if ( $line =~ /^
(?<prespace>\s*)
(?<instruction>[a-z0-9]+)
\s+
(?<args>([^#]*[^ #])?)
(?<postspace>\s*([#].*)?)
$/x
) {
my $trimmed = $+{instruction} . " " . $+{args};
my $prespace = $+{prespace};
my $postspace = $+{postspace};
if (exists $asmMap{$trimmed}) {
$line = ${prespace} . $asmMap{$trimmed} . ${postspace};
} else {
if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have found that this works very poorly when hacking on this file. Here's what happens:

  • I do something that triggers this check, so the build fails.
  • I run the suggested command, but it will only ever find object files that were successfully built, and those successfully-built object files don't use the new forms of the instructions!
  • I have to comment out this check.

Also, the `find target` approach has another downside: it will find every object file that has ever been built. Basically, we'll end up with the union of all forms of all these instructions that have ever been needed unless/until `cargo clean` is run.

There are ways to hack around it but it is ugly.

I think we should move this check to CI, in a separate script. For example, in the package job, we could look at the files generated into the pregenerated directory to find lines that look like VAES/VPCLMULQDQ in the .S files. We could do this in Python instead of Perl. This would solve both of the problems described above.

The other thing is that this is done for nasm output, but we really don't need to do it for nasm. There are important reasons related to Windows ABI stuff where it is important that we keep the nasm output readable. So we should avoid doing this rewriting when the output format is nasm. Note that looking only at .S files is the right thing to do if we're avoiding rewriting the nasm output.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving this to a script run from CI sounds like a good idea.

}
}
print $line,"\n";
}
}

filter_and_print();

close STDOUT or die "error closing STDOUT: $!";
exit 0;
8 changes: 4 additions & 4 deletions crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import re
import sys

PCLMUL_RE = re.compile(r'^\s+[0-9a-f]+:\s+(?P<disas>(?:[0-9a-f][0-9a-f] )+)\s+vpclmul(?P<type>[0-9a-z]+)dq (?P<args>.*%ymm.*)$')
NON_PCLMUL_RE = re.compile(r'^\s+[0-9a-f]+:\s+(?P<disas>(?:[0-9a-f][0-9a-f] )+)\s+(?P<instruction>vaesenc|vaesenclast) (?P<args>.*%ymm.*)$')
PCLMUL_RE = re.compile(r'^\s+[0-9a-f]+:\s+(?P<disas>(?:[0-9a-f][0-9a-f] )+)\s+vpclmul(?P<type>[0-9a-z]+)dq (?P<args>.*%[yz]mm.*)$')
NON_PCLMUL_RE = re.compile(r'^\s+[0-9a-f]+:\s+(?P<disas>(?:[0-9a-f][0-9a-f] )+)\s+(?P<instruction>vaesenc|vaesenclast) (?P<args>.*%[yz]mm.*)$')

TYPE_MAP = {
'lqlq': 0x00,
Expand All @@ -31,11 +31,11 @@ def main():
hexified_disas = hexify_disas(match.group('disas'))
ty = TYPE_MAP[match.group('type')]
args = match.group('args').replace(',', ', ')
print(f" 'vpclmulqdq $0x{ty:02x}, {args}' => '.byte {hexified_disas}',")
print(f" 'vpclmulqdq $0x{ty:02x}, {args}' => '.byte {hexified_disas}',")
elif match := NON_PCLMUL_RE.match(line):
hexified_disas = hexify_disas(match.group('disas'))
args = match.group('args').replace(',', ', ')
print(f" '{match.group('instruction').ljust(16)}{args}' => '.byte {hexified_disas}',")
print(f" '{match.group('instruction')} {args}' => '.byte {hexified_disas}',")


if __name__ == '__main__':
Expand Down