#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 calling convention and SEH tables are selected by assembler flavour
# or by an .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path (including the trailing separator), or
# the empty string when the script was invoked without one.  The capture is
# only read after a successful match, so a stale $1 can never leak in.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register allocation for _mul_1x1.  $a aliases $lo: the multiplicand is
# passed in %rax and the low half of the product is returned in it.
($lo,$hi)=("%rax","%rdx");	$a=$lo;
($i0,$i1)=("%rsi","%rdi");	# table indices, 4-bit slices of $b
($t0,$t1)=("%rbx","%rcx");	# scratch for the integer half of the sum
($b,$mask)=("%rbp","%r8");	# multiplier; nibble mask, loaded by the caller
# %r9..%r14 hold the shifted and combined copies of a that fill the table.
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..14));
($R,$Tx)=("%xmm0","%xmm1");	# SSE2 accumulator and temporary

# _mul_1x1: 64x64->128-bit carry-less multiplication of $a by $b, result in
# $hi:$lo.
#
# The three top bits of a are handled first: each one is broadcast with sar
# and used to conditionally fold b, shifted by 63, 62 and 61 bits, into
# $hi:$lo.  With those bits masked off (a1), a 16-entry table of all XOR
# combinations of a1, a1<<1, a1<<2 and a1<<3 is built in 128 bytes of stack.
# b is then consumed four bits at a time through $mask: the lookups shifted
# by whole bytes are accumulated in $R with pslldq, the ones shifted by an
# odd number of nibbles are accumulated in $lo/$hi with shl/shr.
#
# Besides the result registers the routine overwrites $i0, $i1, $t0, $t1,
# $b, %r9-%r14, $R and $Tx.
$code.=<<___;
.text

.type	_mul_1x1,\@abi-omnipotent
.align	16
_mul_1x1:
.cfi_startproc
	sub	\$128+8,%rsp
.cfi_adjust_cfa_offset	128+8
	mov	\$-1,$a1
	lea	($a,$a),$i0
	shr	\$3,$a1
	lea	(,$a,4),$i1
	and	$a,$a1			# a1=a&0x1fffffffffffffff
	lea	(,$a,8),$a8
	sar	\$63,$a			# broadcast 63rd bit
	lea	($a1,$a1),$a2
	sar	\$63,$i0		# broadcast 62nd bit
	lea	(,$a1,4),$a4
	and	$b,$a
	sar	\$63,$i1		# broadcast 61st bit
	mov	$a,$hi			# $a is $lo
	shl	\$63,$lo
	and	$b,$i0
	shr	\$1,$hi
	mov	$i0,$t1
	shl	\$62,$i0
	and	$b,$i1
	shr	\$2,$t1
	xor	$i0,$lo
	mov	$i1,$t0
	shl	\$61,$i1
	xor	$t1,$hi
	shr	\$3,$t0
	xor	$i1,$lo
	xor	$t0,$hi

	mov	$a1,$a12
	movq	\$0,0(%rsp)		# tab[0]=0
	xor	$a2,$a12		# a1^a2
	mov	$a1,8(%rsp)		# tab[1]=a1
	mov	$a4,$a48
	mov	$a2,16(%rsp)		# tab[2]=a2
	xor	$a8,$a48		# a4^a8
	mov	$a12,24(%rsp)		# tab[3]=a1^a2

	xor	$a4,$a1
	mov	$a4,32(%rsp)		# tab[4]=a4
	xor	$a4,$a2
	mov	$a1,40(%rsp)		# tab[5]=a1^a4
	xor	$a4,$a12
	mov	$a2,48(%rsp)		# tab[6]=a2^a4
	xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
	xor	$a48,$a2		# a2^a4^a4^a8=a2^a8

	mov	$a8,64(%rsp)		# tab[8]=a8
	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
	mov	$a1,72(%rsp)		# tab[9]=a1^a8
	xor	$a4,$a1			# a1^a8^a4
	mov	$a2,80(%rsp)		# tab[10]=a2^a8
	xor	$a4,$a2			# a2^a8^a4
	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8

	xor	$a4,$a12		# a1^a2^a8^a4
	mov	$a48,96(%rsp)		# tab[12]=a4^a8
	mov	$mask,$i0
	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
	and	$b,$i0
	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
	shr	\$4,$b
	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
	mov	$mask,$i1
	and	$b,$i1
	shr	\$4,$b

	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
	mov	$mask,$i0
	and	$b,$i0
	shr	\$4,$b
___
    # Each iteration consumes two nibbles of b: the one indexed by $i1 is
    # shifted by 8*$n-4 bits in integer registers, the one indexed by $i0
    # is shifted by $n bytes in $Tx.
    for ($n=1;$n<8;$n++) {
	$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$mask,$i1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	and	$b,$i1
	movq	(%rsp,$i0,8),$Tx
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	pslldq	\$$n,$Tx
	mov	$mask,$i0
	shr	\$4,$b
	xor	$t0,$hi
	and	$b,$i0
	shr	\$4,$b
	pxor	$Tx,$R
___
    }
# $n is 8 here, so the last lookup below is shifted left by 60 bits and
# right by 4; $R is then folded into $lo (low quadword) and $hi (high one).
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	movq	$R,$i0
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	psrldq	\$8,$R
	xor	$t0,$hi
	movq	$R,$i1
	xor	$i0,$lo
	xor	$i1,$hi

	add	\$128+8,%rsp
.cfi_adjust_cfa_offset	-128-8
	ret
.Lend_mul_1x1:
.cfi_endproc
.size	_mul_1x1,.-_mul_1x1
___

# Argument registers of bn_GF2m_mul_2x2.  Note that $a1 is re-purposed here:
# above it named a scratch register, from now on it names an argument.  On
# Win64 the fifth argument arrives on the stack; %r10 is merely the register
# the vanilla path loads it into.
($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order

# bn_GF2m_mul_2x2 entry and PCLMULQDQ path.  Bit 33 of OPENSSL_ia32cap_P
# selects between the pclmulqdq code and the table-driven "vanilla" code.
# Both compute a1*b1, a0*b0 and (a0+a1)*(b0+b1) and combine them into the
# 256-bit result stored at $rp.
#
# %rax is loaded with the entry %rsp because the Win64 handler below takes
# context->Rax as the stack pointer when the fault is in the prologue.
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
.cfi_startproc
	mov	%rsp,%rax
	mov	OPENSSL_ia32cap_P(%rip),%r10
	bt	\$33,%r10
	jnc	.Lvanilla_mul_2x2

	movq		$a1,%xmm0
	movq		$b1,%xmm1
	movq		$a0,%xmm2
___
# Fifth argument: on the stack for Win64, in a register otherwise.
$code.=<<___ if ($win64);
	movq		40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
	movq		$b0,%xmm3
___
$code.=<<___;
	movdqa		%xmm0,%xmm4
	movdqa		%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
	pxor		%xmm2,%xmm4
	pxor		%xmm3,%xmm5
	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
	xorps		%xmm0,%xmm4
	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
	movdqa		%xmm4,%xmm5
	pslldq		\$8,%xmm4
	psrldq		\$8,%xmm5
	pxor		%xmm4,%xmm2
	pxor		%xmm5,%xmm0
	movdqu		%xmm2,0($rp)
	movdqu		%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	8*17
___
# Vanilla path frame, 17 quadwords:
#   0..3    a0*b0 (low, high) and a1*b1 (low, high)
#   4..8    saved arguments rp, a1, a0, b1, b0
#   10..14  saved %r14, %r13, %r12, %rbp, %rbx
#   15..16  saved %rdi, %rsi (Win64 only)
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
$code.=<<___;
	mov	%r14,8*10(%rsp)
.cfi_rel_offset	%r14,8*10
	mov	%r13,8*11(%rsp)
.cfi_rel_offset	%r13,8*11
	mov	%r12,8*12(%rsp)
.cfi_rel_offset	%r12,8*12
	mov	%rbp,8*13(%rsp)
.cfi_rel_offset	%rbp,8*13
	mov	%rbx,8*14(%rsp)
.cfi_rel_offset	%rbx,8*14
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1·b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0·b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a
	xor	64(%rsp),$b
	call	_mul_1x1		# (a0+a1)·(b0+b1)
___
	# Registers receiving the four quadwords of a0*b0 and a1*b1 that are
	# reloaded from the frame for the final combination.
	@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
	mov	0(%rsp),$r[0]
	mov	8(%rsp),$r[1]
	mov	16(%rsp),$r[2]
	mov	24(%rsp),$r[3]
	mov	32(%rsp),%rbp

	xor	$hi,$lo
	xor	$r[1],$hi
	xor	$r[0],$lo
	mov	$r[0],0(%rbp)
	xor	$r[2],$hi
	mov	$r[3],24(%rbp)
	xor	$r[3],$lo
	xor	$r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14
.cfi_restore	%r14
	mov	8*11(%rsp),%r13
.cfi_restore	%r13
	mov	8*12(%rsp),%r12
.cfi_restore	%r12
	mov	8*13(%rsp),%rbp
.cfi_restore	%rbp
	mov	8*14(%rsp),%rbx
.cfi_restore	%rbx
___
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
$code.=<<___;
	lea	8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	-8*17
.Lepilogue_mul_2x2:
	ret
.Lend_mul_2x2:
.cfi_endproc
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only.  For a fault between .Lbody_mul_2x2 and .Lepilogue_mul_2x2 the
# handler reloads the registers saved in the vanilla frame into the CONTEXT
# and pops the frame; outside that range only context->Rsp is rewritten.
# It then copies the CONTEXT to disp->ContextRecord and calls
# RtlVirtualUnwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	mov	8*10(%rax),%r14		# mimic epilogue
	mov	8*11(%rax),%r13
	mov	8*12(%rax),%r12
	mov	8*13(%rax),%rbp
	mov	8*14(%rax),%rbx
	mov	8*15(%rax),%rdi
	mov	8*16(%rax),%rsi

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	lea	8*17(%rax),%rax

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	_mul_1x1
	.rva	.Lend_mul_1x1
	.rva	.LSEH_info_1x1

	.rva	.Lvanilla_mul_2x2
	.rva	.Lend_mul_2x2
	.rva	.LSEH_info_2x2
.section	.xdata
.align	8
.LSEH_info_1x1:
	.byte	0x01,0x07,0x02,0x00
	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
.LSEH_info_2x2:
	.byte	9,0,0,0
	.rva	se_handler
___
}

# Replace every `expr` in the generated text with the value of the Perl
# expression, then hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";