1#! /usr/bin/env perl 2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# January 2015 18# 19# ChaCha20 for x86. 20# 21# Performance in cycles per byte out of large buffer. 22# 23# 1xIALU/gcc 4xSSSE3 24# Pentium 17.5/+80% 25# PIII 14.2/+60% 26# P4 18.6/+84% 27# Core2 9.56/+89% 4.83 28# Westmere 9.50/+45% 3.35 29# Sandy Bridge 10.5/+47% 3.20 30# Haswell 8.15/+50% 2.83 31# Skylake 7.53/+22% 2.75 32# Silvermont 17.4/+36% 8.35 33# Goldmont 13.4/+40% 4.36 34# Sledgehammer 10.2/+54% 35# Bulldozer 13.4/+50% 4.38(*) 36# 37# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; 38 39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 40push(@INC,"${dir}","${dir}../../perlasm"); 41require "x86asm.pl"; 42 43$output = pop and open STDOUT,">$output"; 44 45&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 46 47$xmm=$ymm=0; 48for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } 49 50$ymm=1 if ($xmm && 51 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 52 =~ /GNU assembler version ([2-9]\.[0-9]+)/ && 53 ($gasver=$1)>=2.19); # first version supporting AVX 54 55$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && 56 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && 57 $1>=2.03); # first version supporting AVX 58 59$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && 60 `ml 2>&1` =~ /Version ([0-9]+)\./ && 61 $1>=10); # first version supporting AVX 62 63$ymm=1 if ($xmm && !$ymm && 64 `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && 65 $2>=3.0); # first version supporting AVX 66 67$a="eax"; 68($b,$b_)=("ebx","ebp"); 69($c,$c_)=("ecx","esi"); 70($d,$d_)=("edx","edi"); 71 72sub QUARTERROUND { 73my ($ai,$bi,$ci,$di,$i)=@_; 74my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next 75my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous 76 77 # a b c d 78 # 79 # 0 4 8 12 < even round 80 # 1 5 9 13 81 # 2 6 10 14 82 # 3 7 11 15 83 # 0 5 10 15 < odd round 84 # 1 6 11 12 85 # 2 7 8 13 86 # 3 4 9 14 87 88 if ($i==0) { 89 my $j=4; 90 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); 91 } elsif ($i==3) { 92 my $j=0; 93 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); 94 } elsif ($i==4) { 95 my $j=4; 96 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); 97 } elsif ($i==7) { 98 my $j=0; 99 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); 100 } 101 102 #&add ($a,$b); # see elsewhere 103 &xor ($d,$a); 104 &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3); 105 &rol ($d,16); 106 &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0); 107 &add ($c,$d); 108 &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3); 109 &xor ($b,$c); 110 &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn); 111 &rol ($b,12); 112 &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7); 113 &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter 114 &add ($a,$b); 115 &xor ($d,$a); 116 &mov (&DWP(4*$ai,"esp"),$a); 117 &rol ($d,8); 118 &mov ($a,&DWP(4*$an,"esp")); 119 &add ($c,$d); 120 &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn); 121 &mov ($d_,$d) if ($di==$dn); 122 &xor ($b,$c); 123 &add ($a,$b_) if ($i<7); # elsewhere 124 &rol ($b,7); 125 126 ($b,$b_)=($b_,$b); 127 ($c,$c_)=($c_,$c); 128 ($d,$d_)=($d_,$d); 129} 130 131&static_label("ssse3_shortcut"); 132&static_label("xop_shortcut"); 133&static_label("ssse3_data"); 134&static_label("pic_point"); 135 136&function_begin("ChaCha20_ctr32"); 137 &xor ("eax","eax"); 138 &cmp ("eax",&wparam(2)); # len==0? 139 &je (&label("no_data")); 140if ($xmm) { 141 &call (&label("pic_point")); 142&set_label("pic_point"); 143 &blindpop("eax"); 144 &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); 145 &test (&DWP(0,"ebp"),1<<24); # test FXSR bit 146 &jz (&label("x86")); 147 &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit 148 &jz (&label("x86")); 149 &jmp (&label("ssse3_shortcut")); 150&set_label("x86"); 151} 152 &mov ("esi",&wparam(3)); # key 153 &mov ("edi",&wparam(4)); # counter and nonce 154 155 &stack_push(33); 156 157 &mov ("eax",&DWP(4*0,"esi")); # copy key 158 &mov ("ebx",&DWP(4*1,"esi")); 159 &mov ("ecx",&DWP(4*2,"esi")); 160 &mov ("edx",&DWP(4*3,"esi")); 161 &mov (&DWP(64+4*4,"esp"),"eax"); 162 &mov (&DWP(64+4*5,"esp"),"ebx"); 163 &mov (&DWP(64+4*6,"esp"),"ecx"); 164 &mov (&DWP(64+4*7,"esp"),"edx"); 165 &mov ("eax",&DWP(4*4,"esi")); 166 &mov ("ebx",&DWP(4*5,"esi")); 167 &mov ("ecx",&DWP(4*6,"esi")); 168 &mov ("edx",&DWP(4*7,"esi")); 169 &mov (&DWP(64+4*8,"esp"),"eax"); 170 &mov (&DWP(64+4*9,"esp"),"ebx"); 171 &mov (&DWP(64+4*10,"esp"),"ecx"); 172 &mov (&DWP(64+4*11,"esp"),"edx"); 173 &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce 174 &mov ("ebx",&DWP(4*1,"edi")); 175 &mov ("ecx",&DWP(4*2,"edi")); 176 &mov ("edx",&DWP(4*3,"edi")); 177 &sub ("eax",1); 178 &mov (&DWP(64+4*12,"esp"),"eax"); 179 &mov (&DWP(64+4*13,"esp"),"ebx"); 180 &mov (&DWP(64+4*14,"esp"),"ecx"); 181 &mov (&DWP(64+4*15,"esp"),"edx"); 182 &jmp (&label("entry")); 183 184&set_label("outer_loop",16); 185 &mov (&wparam(1),$b); # save input 186 &mov (&wparam(0),$a); # save output 187 &mov (&wparam(2),$c); # save len 188&set_label("entry"); 189 &mov ($a,0x61707865); 190 &mov (&DWP(4*1,"esp"),0x3320646e); 191 &mov (&DWP(4*2,"esp"),0x79622d32); 192 &mov (&DWP(4*3,"esp"),0x6b206574); 193 194 &mov ($b, &DWP(64+4*5,"esp")); # copy key material 195 &mov ($b_,&DWP(64+4*6,"esp")); 196 &mov ($c, &DWP(64+4*10,"esp")); 197 &mov ($c_,&DWP(64+4*11,"esp")); 198 &mov ($d, &DWP(64+4*13,"esp")); 199 &mov ($d_,&DWP(64+4*14,"esp")); 200 &mov (&DWP(4*5,"esp"),$b); 201 &mov (&DWP(4*6,"esp"),$b_); 202 &mov (&DWP(4*10,"esp"),$c); 203 &mov (&DWP(4*11,"esp"),$c_); 204 &mov (&DWP(4*13,"esp"),$d); 205 &mov (&DWP(4*14,"esp"),$d_); 206 207 &mov ($b, &DWP(64+4*7,"esp")); 208 &mov ($d_,&DWP(64+4*15,"esp")); 209 &mov ($d, &DWP(64+4*12,"esp")); 210 &mov ($b_,&DWP(64+4*4,"esp")); 211 &mov ($c, &DWP(64+4*8,"esp")); 212 &mov ($c_,&DWP(64+4*9,"esp")); 213 &add ($d,1); # counter value 214 &mov (&DWP(4*7,"esp"),$b); 215 &mov (&DWP(4*15,"esp"),$d_); 216 &mov (&DWP(64+4*12,"esp"),$d); # save counter value 217 218 &mov ($b,10); # loop counter 219 &jmp (&label("loop")); 220 221&set_label("loop",16); 222 &add ($a,$b_); # elsewhere 223 &mov (&DWP(128,"esp"),$b); # save loop counter 224 &mov ($b,$b_); 225 &QUARTERROUND(0, 4, 8, 12, 0); 226 &QUARTERROUND(1, 5, 9, 13, 1); 227 &QUARTERROUND(2, 6,10, 14, 2); 228 &QUARTERROUND(3, 7,11, 15, 3); 229 &QUARTERROUND(0, 5,10, 15, 4); 230 &QUARTERROUND(1, 6,11, 12, 5); 231 &QUARTERROUND(2, 7, 8, 13, 6); 232 &QUARTERROUND(3, 4, 9, 14, 7); 233 &dec ($b); 234 &jnz (&label("loop")); 235 236 &mov ($b,&wparam(2)); # load len 237 238 &add ($a,0x61707865); # accumulate key material 239 &add ($b_,&DWP(64+4*4,"esp")); 240 &add ($c, &DWP(64+4*8,"esp")); 241 &add ($c_,&DWP(64+4*9,"esp")); 242 243 &cmp ($b,64); 244 &jb (&label("tail")); 245 246 &mov ($b,&wparam(1)); # load input pointer 247 &add ($d, &DWP(64+4*12,"esp")); 248 &add ($d_,&DWP(64+4*14,"esp")); 249 250 &xor ($a, &DWP(4*0,$b)); # xor with input 251 &xor ($b_,&DWP(4*4,$b)); 252 &mov (&DWP(4*0,"esp"),$a); 253 &mov ($a,&wparam(0)); # load output pointer 254 &xor ($c, &DWP(4*8,$b)); 255 &xor ($c_,&DWP(4*9,$b)); 256 &xor ($d, &DWP(4*12,$b)); 257 &xor ($d_,&DWP(4*14,$b)); 258 &mov (&DWP(4*4,$a),$b_); # write output 259 &mov (&DWP(4*8,$a),$c); 260 &mov (&DWP(4*9,$a),$c_); 261 &mov (&DWP(4*12,$a),$d); 262 &mov (&DWP(4*14,$a),$d_); 263 264 &mov ($b_,&DWP(4*1,"esp")); 265 &mov ($c, &DWP(4*2,"esp")); 266 &mov ($c_,&DWP(4*3,"esp")); 267 &mov ($d, &DWP(4*5,"esp")); 268 &mov ($d_,&DWP(4*6,"esp")); 269 &add ($b_,0x3320646e); # accumulate key material 270 &add ($c, 0x79622d32); 271 &add ($c_,0x6b206574); 272 &add ($d, &DWP(64+4*5,"esp")); 273 &add ($d_,&DWP(64+4*6,"esp")); 274 &xor ($b_,&DWP(4*1,$b)); 275 &xor ($c, &DWP(4*2,$b)); 276 &xor ($c_,&DWP(4*3,$b)); 277 &xor ($d, &DWP(4*5,$b)); 278 &xor ($d_,&DWP(4*6,$b)); 279 &mov (&DWP(4*1,$a),$b_); 280 &mov (&DWP(4*2,$a),$c); 281 &mov (&DWP(4*3,$a),$c_); 282 &mov (&DWP(4*5,$a),$d); 283 &mov (&DWP(4*6,$a),$d_); 284 285 &mov ($b_,&DWP(4*7,"esp")); 286 &mov ($c, &DWP(4*10,"esp")); 287 &mov ($c_,&DWP(4*11,"esp")); 288 &mov ($d, &DWP(4*13,"esp")); 289 &mov ($d_,&DWP(4*15,"esp")); 290 &add ($b_,&DWP(64+4*7,"esp")); 291 &add ($c, &DWP(64+4*10,"esp")); 292 &add ($c_,&DWP(64+4*11,"esp")); 293 &add ($d, &DWP(64+4*13,"esp")); 294 &add ($d_,&DWP(64+4*15,"esp")); 295 &xor ($b_,&DWP(4*7,$b)); 296 &xor ($c, &DWP(4*10,$b)); 297 &xor ($c_,&DWP(4*11,$b)); 298 &xor ($d, &DWP(4*13,$b)); 299 &xor ($d_,&DWP(4*15,$b)); 300 &lea ($b,&DWP(4*16,$b)); 301 &mov (&DWP(4*7,$a),$b_); 302 &mov ($b_,&DWP(4*0,"esp")); 303 &mov (&DWP(4*10,$a),$c); 304 &mov ($c,&wparam(2)); # len 305 &mov (&DWP(4*11,$a),$c_); 306 &mov (&DWP(4*13,$a),$d); 307 &mov (&DWP(4*15,$a),$d_); 308 &mov (&DWP(4*0,$a),$b_); 309 &lea ($a,&DWP(4*16,$a)); 310 &sub ($c,64); 311 &jnz (&label("outer_loop")); 312 313 &jmp (&label("done")); 314 315&set_label("tail"); 316 &add ($d, &DWP(64+4*12,"esp")); 317 &add ($d_,&DWP(64+4*14,"esp")); 318 &mov (&DWP(4*0,"esp"),$a); 319 &mov (&DWP(4*4,"esp"),$b_); 320 &mov (&DWP(4*8,"esp"),$c); 321 &mov (&DWP(4*9,"esp"),$c_); 322 &mov (&DWP(4*12,"esp"),$d); 323 &mov (&DWP(4*14,"esp"),$d_); 324 325 &mov ($b_,&DWP(4*1,"esp")); 326 &mov ($c, &DWP(4*2,"esp")); 327 &mov ($c_,&DWP(4*3,"esp")); 328 &mov ($d, &DWP(4*5,"esp")); 329 &mov ($d_,&DWP(4*6,"esp")); 330 &add ($b_,0x3320646e); # accumulate key material 331 &add ($c, 0x79622d32); 332 &add ($c_,0x6b206574); 333 &add ($d, &DWP(64+4*5,"esp")); 334 &add ($d_,&DWP(64+4*6,"esp")); 335 &mov (&DWP(4*1,"esp"),$b_); 336 &mov (&DWP(4*2,"esp"),$c); 337 &mov (&DWP(4*3,"esp"),$c_); 338 &mov (&DWP(4*5,"esp"),$d); 339 &mov (&DWP(4*6,"esp"),$d_); 340 341 &mov ($b_,&DWP(4*7,"esp")); 342 &mov ($c, &DWP(4*10,"esp")); 343 &mov ($c_,&DWP(4*11,"esp")); 344 &mov ($d, &DWP(4*13,"esp")); 345 &mov ($d_,&DWP(4*15,"esp")); 346 &add ($b_,&DWP(64+4*7,"esp")); 347 &add ($c, &DWP(64+4*10,"esp")); 348 &add ($c_,&DWP(64+4*11,"esp")); 349 &add ($d, &DWP(64+4*13,"esp")); 350 &add ($d_,&DWP(64+4*15,"esp")); 351 &mov (&DWP(4*7,"esp"),$b_); 352 &mov ($b_,&wparam(1)); # load input 353 &mov (&DWP(4*10,"esp"),$c); 354 &mov ($c,&wparam(0)); # load output 355 &mov (&DWP(4*11,"esp"),$c_); 356 &xor ($c_,$c_); 357 &mov (&DWP(4*13,"esp"),$d); 358 &mov (&DWP(4*15,"esp"),$d_); 359 360 &xor ("eax","eax"); 361 &xor ("edx","edx"); 362&set_label("tail_loop"); 363 &movb ("al",&BP(0,$c_,$b_)); 364 &movb ("dl",&BP(0,"esp",$c_)); 365 &lea ($c_,&DWP(1,$c_)); 366 &xor ("al","dl"); 367 &mov (&BP(-1,$c,$c_),"al"); 368 &dec ($b); 369 &jnz (&label("tail_loop")); 370 371&set_label("done"); 372 &stack_pop(33); 373&set_label("no_data"); 374&function_end("ChaCha20_ctr32"); 375 376if ($xmm) { 377my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); 378my ($out,$inp,$len)=("edi","esi","ecx"); 379 380sub QUARTERROUND_SSSE3 { 381my ($ai,$bi,$ci,$di,$i)=@_; 382my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next 383my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous 384 385 # a b c d 386 # 387 # 0 4 8 12 < even round 388 # 1 5 9 13 389 # 2 6 10 14 390 # 3 7 11 15 391 # 0 5 10 15 < odd round 392 # 1 6 11 12 393 # 2 7 8 13 394 # 3 4 9 14 395 396 if ($i==0) { 397 my $j=4; 398 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); 399 } elsif ($i==3) { 400 my $j=0; 401 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); 402 } elsif ($i==4) { 403 my $j=4; 404 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); 405 } elsif ($i==7) { 406 my $j=0; 407 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); 408 } 409 410 #&paddd ($xa,$xb); # see elsewhere 411 #&pxor ($xd,$xa); # see elsewhere 412 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); 413 &pshufb ($xd,&QWP(0,"eax")); # rot16 414 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); 415 &paddd ($xc,$xd); 416 &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); 417 &pxor ($xb,$xc); 418 &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); 419 &movdqa ($xa_,$xb); # borrow as temporary 420 &pslld ($xb,12); 421 &psrld ($xa_,20); 422 &por ($xb,$xa_); 423 &movdqa($xa_,&QWP(16*$an-128,"ebx")); 424 &paddd ($xa,$xb); 425 &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); 426 &pxor ($xd,$xa); 427 &movdqa (&QWP(16*$ai-128,"ebx"),$xa); 428 &pshufb ($xd,&QWP(16,"eax")); # rot8 429 &paddd ($xc,$xd); 430 &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); 431 &movdqa ($xd_,$xd) if ($di==$dn); 432 &pxor ($xb,$xc); 433 &paddd ($xa_,$xb_) if ($i<7); # elsewhere 434 &movdqa ($xa,$xb); # borrow as temporary 435 &pslld ($xb,7); 436 &psrld ($xa,25); 437 &pxor ($xd_,$xa_) if ($i<7); # elsewhere 438 &por ($xb,$xa); 439 440 ($xa,$xa_)=($xa_,$xa); 441 ($xb,$xb_)=($xb_,$xb); 442 ($xc,$xc_)=($xc_,$xc); 443 ($xd,$xd_)=($xd_,$xd); 444} 445 446&function_begin("ChaCha20_ssse3"); 447&set_label("ssse3_shortcut"); 448if ($ymm) { 449 &test (&DWP(4,"ebp"),1<<11); # test XOP bit 450 &jnz (&label("xop_shortcut")); 451} 452 453 &mov ($out,&wparam(0)); 454 &mov ($inp,&wparam(1)); 455 &mov ($len,&wparam(2)); 456 &mov ("edx",&wparam(3)); # key 457 &mov ("ebx",&wparam(4)); # counter and nonce 458 459 &mov ("ebp","esp"); 460 &stack_push (131); 461 &and ("esp",-64); 462 &mov (&DWP(512,"esp"),"ebp"); 463 464 &lea ("eax",&DWP(&label("ssse3_data")."-". 465 &label("pic_point"),"eax")); 466 &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce 467 468if (defined($gasver) && $gasver>=2.17) { # even though we encode 469 # pshufb manually, we 470 # handle only register 471 # operands, while this 472 # segment uses memory 473 # operand... 474 &cmp ($len,64*4); 475 &jb (&label("1x")); 476 477 &mov (&DWP(512+4,"esp"),"edx"); # offload pointers 478 &mov (&DWP(512+8,"esp"),"ebx"); 479 &sub ($len,64*4); # bias len 480 &lea ("ebp",&DWP(256+128,"esp")); # size optimization 481 482 &movdqu ("xmm7",&QWP(0,"edx")); # key 483 &pshufd ("xmm0","xmm3",0x00); 484 &pshufd ("xmm1","xmm3",0x55); 485 &pshufd ("xmm2","xmm3",0xaa); 486 &pshufd ("xmm3","xmm3",0xff); 487 &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters 488 &pshufd ("xmm4","xmm7",0x00); 489 &pshufd ("xmm5","xmm7",0x55); 490 &psubd ("xmm0",&QWP(16*4,"eax")); 491 &pshufd ("xmm6","xmm7",0xaa); 492 &pshufd ("xmm7","xmm7",0xff); 493 &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); 494 &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); 495 &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); 496 &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); 497 &movdqu ("xmm3",&QWP(16,"edx")); # key 498 &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); 499 &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); 500 &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); 501 &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); 502 &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma 503 &lea ("ebx",&DWP(128,"esp")); # size optimization 504 505 &pshufd ("xmm0","xmm3",0x00); 506 &pshufd ("xmm1","xmm3",0x55); 507 &pshufd ("xmm2","xmm3",0xaa); 508 &pshufd ("xmm3","xmm3",0xff); 509 &pshufd ("xmm4","xmm7",0x00); 510 &pshufd ("xmm5","xmm7",0x55); 511 &pshufd ("xmm6","xmm7",0xaa); 512 &pshufd ("xmm7","xmm7",0xff); 513 &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); 514 &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); 515 &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); 516 &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); 517 &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); 518 &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); 519 &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); 520 &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); 521 522 &lea ($inp,&DWP(128,$inp)); # size optimization 523 &lea ($out,&DWP(128,$out)); # size optimization 524 &jmp (&label("outer_loop")); 525 526&set_label("outer_loop",16); 527 #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material 528 &movdqa ("xmm1",&QWP(16*1-128,"ebp")); 529 &movdqa ("xmm2",&QWP(16*2-128,"ebp")); 530 &movdqa ("xmm3",&QWP(16*3-128,"ebp")); 531 #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); 532 &movdqa ("xmm5",&QWP(16*5-128,"ebp")); 533 &movdqa ("xmm6",&QWP(16*6-128,"ebp")); 534 &movdqa ("xmm7",&QWP(16*7-128,"ebp")); 535 #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); 536 &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); 537 &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); 538 &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); 539 #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); 540 &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); 541 &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); 542 &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); 543 #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); 544 #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); 545 &movdqa ("xmm2",&QWP(16*10-128,"ebp")); 546 &movdqa ("xmm3",&QWP(16*11-128,"ebp")); 547 &movdqa ("xmm4",&QWP(16*12-128,"ebp")); 548 &movdqa ("xmm5",&QWP(16*13-128,"ebp")); 549 &movdqa ("xmm6",&QWP(16*14-128,"ebp")); 550 &movdqa ("xmm7",&QWP(16*15-128,"ebp")); 551 &paddd ("xmm4",&QWP(16*4,"eax")); # counter value 552 #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); 553 #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); 554 &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); 555 &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); 556 &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); 557 &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); 558 &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); 559 &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); 560 &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value 561 562 &movdqa ($xa, &QWP(16*0-128,"ebp")); 563 &movdqa ($xd, "xmm4"); 564 &movdqa ($xb_,&QWP(16*4-128,"ebp")); 565 &movdqa ($xc, &QWP(16*8-128,"ebp")); 566 &movdqa ($xc_,&QWP(16*9-128,"ebp")); 567 568 &mov ("edx",10); # loop counter 569 &nop (); 570 571&set_label("loop",16); 572 &paddd ($xa,$xb_); # elsewhere 573 &movdqa ($xb,$xb_); 574 &pxor ($xd,$xa); # elsewhere 575 &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); 576 &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); 577 &QUARTERROUND_SSSE3(2, 6,10, 14, 2); 578 &QUARTERROUND_SSSE3(3, 7,11, 15, 3); 579 &QUARTERROUND_SSSE3(0, 5,10, 15, 4); 580 &QUARTERROUND_SSSE3(1, 6,11, 12, 5); 581 &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); 582 &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); 583 &dec ("edx"); 584 &jnz (&label("loop")); 585 586 &movdqa (&QWP(16*4-128,"ebx"),$xb_); 587 &movdqa (&QWP(16*8-128,"ebx"),$xc); 588 &movdqa (&QWP(16*9-128,"ebx"),$xc_); 589 &movdqa (&QWP(16*12-128,"ebx"),$xd); 590 &movdqa (&QWP(16*14-128,"ebx"),$xd_); 591 592 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); 593 594 #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there 595 &movdqa ($xa1,&QWP(16*1-128,"ebx")); 596 &movdqa ($xa2,&QWP(16*2-128,"ebx")); 597 &movdqa ($xa3,&QWP(16*3-128,"ebx")); 598 599 for($i=0;$i<256;$i+=64) { 600 &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material 601 &paddd ($xa1,&QWP($i+16*1-128,"ebp")); 602 &paddd ($xa2,&QWP($i+16*2-128,"ebp")); 603 &paddd ($xa3,&QWP($i+16*3-128,"ebp")); 604 605 &movdqa ($xt2,$xa0); # "de-interlace" data 606 &punpckldq ($xa0,$xa1); 607 &movdqa ($xt3,$xa2); 608 &punpckldq ($xa2,$xa3); 609 &punpckhdq ($xt2,$xa1); 610 &punpckhdq ($xt3,$xa3); 611 &movdqa ($xa1,$xa0); 612 &punpcklqdq ($xa0,$xa2); # "a0" 613 &movdqa ($xa3,$xt2); 614 &punpcklqdq ($xt2,$xt3); # "a2" 615 &punpckhqdq ($xa1,$xa2); # "a1" 616 &punpckhqdq ($xa3,$xt3); # "a3" 617 618 #($xa2,$xt2)=($xt2,$xa2); 619 620 &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input 621 &movdqu ($xt1,&QWP(64*1-128,$inp)); 622 &movdqu ($xa2,&QWP(64*2-128,$inp)); 623 &movdqu ($xt3,&QWP(64*3-128,$inp)); 624 &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); 625 &pxor ($xt0,$xa0); 626 &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); 627 &pxor ($xt1,$xa1); 628 &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); 629 &pxor ($xt2,$xa2); 630 &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); 631 &pxor ($xt3,$xa3); 632 &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); 633 &movdqu (&QWP(64*0-128,$out),$xt0); # store output 634 &movdqu (&QWP(64*1-128,$out),$xt1); 635 &movdqu (&QWP(64*2-128,$out),$xt2); 636 &movdqu (&QWP(64*3-128,$out),$xt3); 637 &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); 638 } 639 &sub ($len,64*4); 640 &jnc (&label("outer_loop")); 641 642 &add ($len,64*4); 643 &jz (&label("done")); 644 645 &mov ("ebx",&DWP(512+8,"esp")); # restore pointers 646 &lea ($inp,&DWP(-128,$inp)); 647 &mov ("edx",&DWP(512+4,"esp")); 648 &lea ($out,&DWP(-128,$out)); 649 650 &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value 651 &movdqu ("xmm3",&QWP(0,"ebx")); 652 &paddd ("xmm2",&QWP(16*6,"eax")); # +four 653 &pand ("xmm3",&QWP(16*7,"eax")); 654 &por ("xmm3","xmm2"); # counter value 655} 656{ 657my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); 658 659sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round 660 &paddd ($a,$b); 661 &pxor ($d,$a); 662 &pshufb ($d,$rot16); 663 664 &paddd ($c,$d); 665 &pxor ($b,$c); 666 &movdqa ($t,$b); 667 &psrld ($b,20); 668 &pslld ($t,12); 669 &por ($b,$t); 670 671 &paddd ($a,$b); 672 &pxor ($d,$a); 673 &pshufb ($d,$rot24); 674 675 &paddd ($c,$d); 676 &pxor ($b,$c); 677 &movdqa ($t,$b); 678 &psrld ($b,25); 679 &pslld ($t,7); 680 &por ($b,$t); 681} 682 683&set_label("1x"); 684 &movdqa ($a,&QWP(16*2,"eax")); # sigma 685 &movdqu ($b,&QWP(0,"edx")); 686 &movdqu ($c,&QWP(16,"edx")); 687 #&movdqu ($d,&QWP(0,"ebx")); # already loaded 688 &movdqa ($rot16,&QWP(0,"eax")); 689 &movdqa ($rot24,&QWP(16,"eax")); 690 &mov (&DWP(16*3,"esp"),"ebp"); 691 692 &movdqa (&QWP(16*0,"esp"),$a); 693 &movdqa (&QWP(16*1,"esp"),$b); 694 &movdqa (&QWP(16*2,"esp"),$c); 695 &movdqa (&QWP(16*3,"esp"),$d); 696 &mov ("edx",10); 697 &jmp (&label("loop1x")); 698 699&set_label("outer1x",16); 700 &movdqa ($d,&QWP(16*5,"eax")); # one 701 &movdqa ($a,&QWP(16*0,"esp")); 702 &movdqa ($b,&QWP(16*1,"esp")); 703 &movdqa ($c,&QWP(16*2,"esp")); 704 &paddd ($d,&QWP(16*3,"esp")); 705 &mov ("edx",10); 706 &movdqa (&QWP(16*3,"esp"),$d); 707 &jmp (&label("loop1x")); 708 709&set_label("loop1x",16); 710 &SSSE3ROUND(); 711 &pshufd ($c,$c,0b01001110); 712 &pshufd ($b,$b,0b00111001); 713 &pshufd ($d,$d,0b10010011); 714 &nop (); 715 716 &SSSE3ROUND(); 717 &pshufd ($c,$c,0b01001110); 718 &pshufd ($b,$b,0b10010011); 719 &pshufd ($d,$d,0b00111001); 720 721 &dec ("edx"); 722 &jnz (&label("loop1x")); 723 724 &paddd ($a,&QWP(16*0,"esp")); 725 &paddd ($b,&QWP(16*1,"esp")); 726 &paddd ($c,&QWP(16*2,"esp")); 727 &paddd ($d,&QWP(16*3,"esp")); 728 729 &cmp ($len,64); 730 &jb (&label("tail")); 731 732 &movdqu ($t,&QWP(16*0,$inp)); 733 &movdqu ($t1,&QWP(16*1,$inp)); 734 &pxor ($a,$t); # xor with input 735 &movdqu ($t,&QWP(16*2,$inp)); 736 &pxor ($b,$t1); 737 &movdqu ($t1,&QWP(16*3,$inp)); 738 &pxor ($c,$t); 739 &pxor ($d,$t1); 740 &lea ($inp,&DWP(16*4,$inp)); # inp+=64 741 742 &movdqu (&QWP(16*0,$out),$a); # write output 743 &movdqu (&QWP(16*1,$out),$b); 744 &movdqu (&QWP(16*2,$out),$c); 745 &movdqu (&QWP(16*3,$out),$d); 746 &lea ($out,&DWP(16*4,$out)); # inp+=64 747 748 &sub ($len,64); 749 &jnz (&label("outer1x")); 750 751 &jmp (&label("done")); 752 753&set_label("tail"); 754 &movdqa (&QWP(16*0,"esp"),$a); 755 &movdqa (&QWP(16*1,"esp"),$b); 756 &movdqa (&QWP(16*2,"esp"),$c); 757 &movdqa (&QWP(16*3,"esp"),$d); 758 759 &xor ("eax","eax"); 760 &xor ("edx","edx"); 761 &xor ("ebp","ebp"); 762 763&set_label("tail_loop"); 764 &movb ("al",&BP(0,"esp","ebp")); 765 &movb ("dl",&BP(0,$inp,"ebp")); 766 &lea ("ebp",&DWP(1,"ebp")); 767 &xor ("al","dl"); 768 &movb (&BP(-1,$out,"ebp"),"al"); 769 &dec ($len); 770 &jnz (&label("tail_loop")); 771} 772&set_label("done"); 773 &mov ("esp",&DWP(512,"esp")); 774&function_end("ChaCha20_ssse3"); 775 776&align (64); 777&set_label("ssse3_data"); 778&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); 779&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); 780&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); 781&data_word(0,1,2,3); 782&data_word(4,4,4,4); 783&data_word(1,0,0,0); 784&data_word(4,0,0,0); 785&data_word(0,-1,-1,-1); 786&align (64); 787} 788&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>"); 789 790if ($ymm) { 791my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); 792my ($out,$inp,$len)=("edi","esi","ecx"); 793 794sub QUARTERROUND_XOP { 795my ($ai,$bi,$ci,$di,$i)=@_; 796my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next 797my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous 798 799 # a b c d 800 # 801 # 0 4 8 12 < even round 802 # 1 5 9 13 803 # 2 6 10 14 804 # 3 7 11 15 805 # 0 5 10 15 < odd round 806 # 1 6 11 12 807 # 2 7 8 13 808 # 3 4 9 14 809 810 if ($i==0) { 811 my $j=4; 812 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); 813 } elsif ($i==3) { 814 my $j=0; 815 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); 816 } elsif ($i==4) { 817 my $j=4; 818 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); 819 } elsif ($i==7) { 820 my $j=0; 821 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); 822 } 823 824 #&vpaddd ($xa,$xa,$xb); # see elsewhere 825 #&vpxor ($xd,$xd,$xa); # see elsewhere 826 &vmovdqa (&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); 827 &vprotd ($xd,$xd,16); 828 &vmovdqa (&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); 829 &vpaddd ($xc,$xc,$xd); 830 &vmovdqa ($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); 831 &vpxor ($xb,$i!=0?$xb:$xb_,$xc); 832 &vmovdqa ($xa_,&QWP(16*$an-128,"ebx")); 833 &vprotd ($xb,$xb,12); 834 &vmovdqa ($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); 835 &vpaddd ($xa,$xa,$xb); 836 &vmovdqa ($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); 837 &vpxor ($xd,$xd,$xa); 838 &vpaddd ($xa_,$xa_,$xb_) if ($i<7); # elsewhere 839 &vprotd ($xd,$xd,8); 840 &vmovdqa (&QWP(16*$ai-128,"ebx"),$xa); 841 &vpaddd ($xc,$xc,$xd); 842 &vmovdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); 843 &vpxor ($xb,$xb,$xc); 844 &vpxor ($xd_,$di==$dn?$xd:$xd_,$xa_) if ($i<7); # elsewhere 845 &vprotd ($xb,$xb,7); 846 847 ($xa,$xa_)=($xa_,$xa); 848 ($xb,$xb_)=($xb_,$xb); 849 ($xc,$xc_)=($xc_,$xc); 850 ($xd,$xd_)=($xd_,$xd); 851} 852 853&function_begin("ChaCha20_xop"); 854&set_label("xop_shortcut"); 855 &mov ($out,&wparam(0)); 856 &mov ($inp,&wparam(1)); 857 &mov ($len,&wparam(2)); 858 &mov ("edx",&wparam(3)); # key 859 &mov ("ebx",&wparam(4)); # counter and nonce 860 &vzeroupper (); 861 862 &mov ("ebp","esp"); 863 &stack_push (131); 864 &and ("esp",-64); 865 &mov (&DWP(512,"esp"),"ebp"); 866 867 &lea ("eax",&DWP(&label("ssse3_data")."-". 868 &label("pic_point"),"eax")); 869 &vmovdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce 870 871 &cmp ($len,64*4); 872 &jb (&label("1x")); 873 874 &mov (&DWP(512+4,"esp"),"edx"); # offload pointers 875 &mov (&DWP(512+8,"esp"),"ebx"); 876 &sub ($len,64*4); # bias len 877 &lea ("ebp",&DWP(256+128,"esp")); # size optimization 878 879 &vmovdqu ("xmm7",&QWP(0,"edx")); # key 880 &vpshufd ("xmm0","xmm3",0x00); 881 &vpshufd ("xmm1","xmm3",0x55); 882 &vpshufd ("xmm2","xmm3",0xaa); 883 &vpshufd ("xmm3","xmm3",0xff); 884 &vpaddd ("xmm0","xmm0",&QWP(16*3,"eax")); # fix counters 885 &vpshufd ("xmm4","xmm7",0x00); 886 &vpshufd ("xmm5","xmm7",0x55); 887 &vpsubd ("xmm0","xmm0",&QWP(16*4,"eax")); 888 &vpshufd ("xmm6","xmm7",0xaa); 889 &vpshufd ("xmm7","xmm7",0xff); 890 &vmovdqa (&QWP(16*12-128,"ebp"),"xmm0"); 891 &vmovdqa (&QWP(16*13-128,"ebp"),"xmm1"); 892 &vmovdqa (&QWP(16*14-128,"ebp"),"xmm2"); 893 &vmovdqa (&QWP(16*15-128,"ebp"),"xmm3"); 894 &vmovdqu ("xmm3",&QWP(16,"edx")); # key 895 &vmovdqa (&QWP(16*4-128,"ebp"),"xmm4"); 896 &vmovdqa (&QWP(16*5-128,"ebp"),"xmm5"); 897 &vmovdqa (&QWP(16*6-128,"ebp"),"xmm6"); 898 &vmovdqa (&QWP(16*7-128,"ebp"),"xmm7"); 899 &vmovdqa ("xmm7",&QWP(16*2,"eax")); # sigma 900 &lea ("ebx",&DWP(128,"esp")); # size optimization 901 902 &vpshufd ("xmm0","xmm3",0x00); 903 &vpshufd ("xmm1","xmm3",0x55); 904 &vpshufd ("xmm2","xmm3",0xaa); 905 &vpshufd ("xmm3","xmm3",0xff); 906 &vpshufd ("xmm4","xmm7",0x00); 907 &vpshufd ("xmm5","xmm7",0x55); 908 &vpshufd ("xmm6","xmm7",0xaa); 909 &vpshufd ("xmm7","xmm7",0xff); 910 &vmovdqa (&QWP(16*8-128,"ebp"),"xmm0"); 911 &vmovdqa (&QWP(16*9-128,"ebp"),"xmm1"); 912 &vmovdqa (&QWP(16*10-128,"ebp"),"xmm2"); 913 &vmovdqa (&QWP(16*11-128,"ebp"),"xmm3"); 914 &vmovdqa (&QWP(16*0-128,"ebp"),"xmm4"); 915 &vmovdqa (&QWP(16*1-128,"ebp"),"xmm5"); 916 &vmovdqa (&QWP(16*2-128,"ebp"),"xmm6"); 917 &vmovdqa (&QWP(16*3-128,"ebp"),"xmm7"); 918 919 &lea ($inp,&DWP(128,$inp)); # size optimization 920 &lea ($out,&DWP(128,$out)); # size optimization 921 &jmp (&label("outer_loop")); 922 923&set_label("outer_loop",32); 924 #&vmovdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material 925 &vmovdqa ("xmm1",&QWP(16*1-128,"ebp")); 926 &vmovdqa ("xmm2",&QWP(16*2-128,"ebp")); 927 &vmovdqa ("xmm3",&QWP(16*3-128,"ebp")); 928 #&vmovdqa ("xmm4",&QWP(16*4-128,"ebp")); 929 &vmovdqa ("xmm5",&QWP(16*5-128,"ebp")); 930 &vmovdqa ("xmm6",&QWP(16*6-128,"ebp")); 931 &vmovdqa ("xmm7",&QWP(16*7-128,"ebp")); 932 #&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0"); 933 &vmovdqa (&QWP(16*1-128,"ebx"),"xmm1"); 934 &vmovdqa (&QWP(16*2-128,"ebx"),"xmm2"); 935 &vmovdqa (&QWP(16*3-128,"ebx"),"xmm3"); 936 #&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4"); 937 &vmovdqa (&QWP(16*5-128,"ebx"),"xmm5"); 938 &vmovdqa (&QWP(16*6-128,"ebx"),"xmm6"); 939 &vmovdqa (&QWP(16*7-128,"ebx"),"xmm7"); 940 #&vmovdqa ("xmm0",&QWP(16*8-128,"ebp")); 941 #&vmovdqa ("xmm1",&QWP(16*9-128,"ebp")); 942 &vmovdqa ("xmm2",&QWP(16*10-128,"ebp")); 943 &vmovdqa ("xmm3",&QWP(16*11-128,"ebp")); 944 &vmovdqa ("xmm4",&QWP(16*12-128,"ebp")); 945 &vmovdqa ("xmm5",&QWP(16*13-128,"ebp")); 946 &vmovdqa ("xmm6",&QWP(16*14-128,"ebp")); 947 &vmovdqa ("xmm7",&QWP(16*15-128,"ebp")); 948 &vpaddd ("xmm4","xmm4",&QWP(16*4,"eax")); # counter value 949 #&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0"); 950 #&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1"); 951 &vmovdqa (&QWP(16*10-128,"ebx"),"xmm2"); 952 &vmovdqa (&QWP(16*11-128,"ebx"),"xmm3"); 953 &vmovdqa (&QWP(16*12-128,"ebx"),"xmm4"); 954 &vmovdqa (&QWP(16*13-128,"ebx"),"xmm5"); 955 &vmovdqa (&QWP(16*14-128,"ebx"),"xmm6"); 956 &vmovdqa (&QWP(16*15-128,"ebx"),"xmm7"); 957 &vmovdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value 958 959 &vmovdqa ($xa, &QWP(16*0-128,"ebp")); 960 &vmovdqa ($xd, "xmm4"); 961 &vmovdqa ($xb_,&QWP(16*4-128,"ebp")); 962 &vmovdqa ($xc, &QWP(16*8-128,"ebp")); 963 &vmovdqa ($xc_,&QWP(16*9-128,"ebp")); 964 965 &mov ("edx",10); # loop counter 966 &nop (); 967 968&set_label("loop",32); 969 &vpaddd ($xa,$xa,$xb_); # elsewhere 970 &vpxor ($xd,$xd,$xa); # elsewhere 971 &QUARTERROUND_XOP(0, 4, 8, 12, 0); 972 &QUARTERROUND_XOP(1, 5, 9, 13, 1); 973 &QUARTERROUND_XOP(2, 6,10, 14, 2); 974 &QUARTERROUND_XOP(3, 7,11, 15, 3); 975 &QUARTERROUND_XOP(0, 5,10, 15, 4); 976 &QUARTERROUND_XOP(1, 6,11, 12, 5); 977 &QUARTERROUND_XOP(2, 7, 8, 13, 6); 978 &QUARTERROUND_XOP(3, 4, 9, 14, 7); 979 &dec ("edx"); 980 &jnz (&label("loop")); 981 982 &vmovdqa (&QWP(16*4-128,"ebx"),$xb_); 983 &vmovdqa (&QWP(16*8-128,"ebx"),$xc); 984 &vmovdqa (&QWP(16*9-128,"ebx"),$xc_); 985 &vmovdqa (&QWP(16*12-128,"ebx"),$xd); 986 &vmovdqa (&QWP(16*14-128,"ebx"),$xd_); 987 988 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); 989 990 #&vmovdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there 991 &vmovdqa ($xa1,&QWP(16*1-128,"ebx")); 992 &vmovdqa ($xa2,&QWP(16*2-128,"ebx")); 993 &vmovdqa ($xa3,&QWP(16*3-128,"ebx")); 994 995 for($i=0;$i<256;$i+=64) { 996 &vpaddd ($xa0,$xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material 997 &vpaddd ($xa1,$xa1,&QWP($i+16*1-128,"ebp")); 998 &vpaddd ($xa2,$xa2,&QWP($i+16*2-128,"ebp")); 999 &vpaddd ($xa3,$xa3,&QWP($i+16*3-128,"ebp")); 1000 1001 &vpunpckldq ($xt2,$xa0,$xa1); # "de-interlace" data 1002 &vpunpckldq ($xt3,$xa2,$xa3); 1003 &vpunpckhdq ($xa0,$xa0,$xa1); 1004 &vpunpckhdq ($xa2,$xa2,$xa3); 1005 &vpunpcklqdq ($xa1,$xt2,$xt3); # "a0" 1006 &vpunpckhqdq ($xt2,$xt2,$xt3); # "a1" 1007 &vpunpcklqdq ($xt3,$xa0,$xa2); # "a2" 1008 &vpunpckhqdq ($xa3,$xa0,$xa2); # "a3" 1009 1010 &vpxor ($xt0,$xa1,&QWP(64*0-128,$inp)); 1011 &vpxor ($xt1,$xt2,&QWP(64*1-128,$inp)); 1012 &vpxor ($xt2,$xt3,&QWP(64*2-128,$inp)); 1013 &vpxor ($xt3,$xa3,&QWP(64*3-128,$inp)); 1014 &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); 1015 &vmovdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); 1016 &vmovdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); 1017 &vmovdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); 1018 &vmovdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); 1019 &vmovdqu (&QWP(64*0-128,$out),$xt0); # store output 1020 &vmovdqu (&QWP(64*1-128,$out),$xt1); 1021 &vmovdqu (&QWP(64*2-128,$out),$xt2); 1022 &vmovdqu (&QWP(64*3-128,$out),$xt3); 1023 &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); 1024 } 1025 &sub ($len,64*4); 1026 &jnc (&label("outer_loop")); 1027 1028 &add ($len,64*4); 1029 &jz (&label("done")); 1030 1031 &mov ("ebx",&DWP(512+8,"esp")); # restore pointers 1032 &lea ($inp,&DWP(-128,$inp)); 1033 &mov ("edx",&DWP(512+4,"esp")); 1034 &lea ($out,&DWP(-128,$out)); 1035 1036 &vmovd ("xmm2",&DWP(16*12-128,"ebp")); # counter value 1037 &vmovdqu ("xmm3",&QWP(0,"ebx")); 1038 &vpaddd ("xmm2","xmm2",&QWP(16*6,"eax"));# +four 1039 &vpand ("xmm3","xmm3",&QWP(16*7,"eax")); 1040 &vpor ("xmm3","xmm3","xmm2"); # counter value 1041{ 1042my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); 1043 1044sub XOPROUND { 1045 &vpaddd ($a,$a,$b); 1046 &vpxor ($d,$d,$a); 1047 &vprotd ($d,$d,16); 1048 1049 &vpaddd ($c,$c,$d); 1050 &vpxor ($b,$b,$c); 1051 &vprotd ($b,$b,12); 1052 1053 &vpaddd ($a,$a,$b); 1054 &vpxor ($d,$d,$a); 1055 &vprotd ($d,$d,8); 1056 1057 &vpaddd ($c,$c,$d); 1058 &vpxor ($b,$b,$c); 1059 &vprotd ($b,$b,7); 1060} 1061 1062&set_label("1x"); 1063 &vmovdqa ($a,&QWP(16*2,"eax")); # sigma 1064 &vmovdqu ($b,&QWP(0,"edx")); 1065 &vmovdqu ($c,&QWP(16,"edx")); 1066 #&vmovdqu ($d,&QWP(0,"ebx")); # already loaded 1067 &vmovdqa ($rot16,&QWP(0,"eax")); 1068 &vmovdqa ($rot24,&QWP(16,"eax")); 1069 &mov (&DWP(16*3,"esp"),"ebp"); 1070 1071 &vmovdqa (&QWP(16*0,"esp"),$a); 1072 &vmovdqa (&QWP(16*1,"esp"),$b); 1073 &vmovdqa (&QWP(16*2,"esp"),$c); 1074 &vmovdqa (&QWP(16*3,"esp"),$d); 1075 &mov ("edx",10); 1076 &jmp (&label("loop1x")); 1077 1078&set_label("outer1x",16); 1079 &vmovdqa ($d,&QWP(16*5,"eax")); # one 1080 &vmovdqa ($a,&QWP(16*0,"esp")); 1081 &vmovdqa ($b,&QWP(16*1,"esp")); 1082 &vmovdqa ($c,&QWP(16*2,"esp")); 1083 &vpaddd ($d,$d,&QWP(16*3,"esp")); 1084 &mov ("edx",10); 1085 &vmovdqa (&QWP(16*3,"esp"),$d); 1086 &jmp (&label("loop1x")); 1087 1088&set_label("loop1x",16); 1089 &XOPROUND(); 1090 &vpshufd ($c,$c,0b01001110); 1091 &vpshufd ($b,$b,0b00111001); 1092 &vpshufd ($d,$d,0b10010011); 1093 1094 &XOPROUND(); 1095 &vpshufd ($c,$c,0b01001110); 1096 &vpshufd ($b,$b,0b10010011); 1097 &vpshufd ($d,$d,0b00111001); 1098 1099 &dec ("edx"); 1100 &jnz (&label("loop1x")); 1101 1102 &vpaddd ($a,$a,&QWP(16*0,"esp")); 1103 &vpaddd ($b,$b,&QWP(16*1,"esp")); 1104 &vpaddd ($c,$c,&QWP(16*2,"esp")); 1105 &vpaddd ($d,$d,&QWP(16*3,"esp")); 1106 1107 &cmp ($len,64); 1108 &jb (&label("tail")); 1109 1110 &vpxor ($a,$a,&QWP(16*0,$inp)); # xor with input 1111 &vpxor ($b,$b,&QWP(16*1,$inp)); 1112 &vpxor ($c,$c,&QWP(16*2,$inp)); 1113 &vpxor ($d,$d,&QWP(16*3,$inp)); 1114 &lea ($inp,&DWP(16*4,$inp)); # inp+=64 1115 1116 &vmovdqu (&QWP(16*0,$out),$a); # write output 1117 &vmovdqu (&QWP(16*1,$out),$b); 1118 &vmovdqu (&QWP(16*2,$out),$c); 1119 &vmovdqu (&QWP(16*3,$out),$d); 1120 &lea ($out,&DWP(16*4,$out)); # inp+=64 1121 1122 &sub ($len,64); 1123 &jnz (&label("outer1x")); 1124 1125 &jmp (&label("done")); 1126 1127&set_label("tail"); 1128 &vmovdqa (&QWP(16*0,"esp"),$a); 1129 &vmovdqa (&QWP(16*1,"esp"),$b); 1130 &vmovdqa (&QWP(16*2,"esp"),$c); 1131 &vmovdqa (&QWP(16*3,"esp"),$d); 1132 1133 &xor ("eax","eax"); 1134 &xor ("edx","edx"); 1135 &xor ("ebp","ebp"); 1136 1137&set_label("tail_loop"); 1138 &movb ("al",&BP(0,"esp","ebp")); 1139 &movb ("dl",&BP(0,$inp,"ebp")); 1140 &lea ("ebp",&DWP(1,"ebp")); 1141 &xor ("al","dl"); 1142 &movb (&BP(-1,$out,"ebp"),"al"); 1143 &dec ($len); 1144 &jnz (&label("tail_loop")); 1145} 1146&set_label("done"); 1147 &vzeroupper (); 1148 &mov ("esp",&DWP(512,"esp")); 1149&function_end("ChaCha20_xop"); 1150} 1151 1152&asm_finish(); 1153 1154close STDOUT or die "error closing STDOUT: $!"; 1155