#! /usr/bin/env perl
# Copyright 2008-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD64	Core2	EM64T
# -evp camellia-128-ecb	16.7	21.0	22.7
# + over gcc 3.4.6	+25%	+5%	0%
#
# camellia-128-cbc	15.7	20.4	21.1
#
# 128-bit key setup	128	216	205	cycles/key
# + over gcc 3.4.6	+54%	+39%	+15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 calling convention is selected by flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator next to this script, or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# hi(REG): map %eax/%rax..%edx/%rdx to the high-byte register (%ah..%dh).
# Any other register name is returned unchanged.
# No prototype: the sub takes one argument, which an empty "()" prototype
# would forbid for any call not written with a leading "&".
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }

# lo(REG): map a 32/64-bit register name to its low-byte register
# (%eax->%al, %esi->%sil, %r8d->%r8b).
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }

# Register allocation shared by all code generators in this file.
$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
@S=("%r8d","%r9d","%r10d","%r11d");
$i0="%esi";
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of the four (pairwise interleaved) tables from .LCamellia_SBOX.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel(ROUND [, SEED])
# Appends the assembly for one Feistel round to $code.
#   ROUND - round index; its parity selects which pair of @S is the input
#           and which pair is updated, and it sets the next-key offset.
#   SEED  - byte offset of the key material relative to $key (default 0).
#           A negative SEED makes the key offsets step by -8 instead of +8.
# The emitted code expects the current round key in $t0/$t1 and leaves the
# next round key prefetched there.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 if (!defined($seed));
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } (0..3);

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.cfi_endproc
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lenc_epilogue:
	ret
.cfi_endproc
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# Six Feistel rounds per loop iteration, keys read forward from $key+16.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.cfi_endproc
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Ldec_epilogue:
	ret
.cfi_endproc
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# Same six rounds, but with a negative seed so keys are read backwards.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]	# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _take_bias(\@operands)
# If the first operand is an integer literal, remove it from the list and
# return it as the byte bias; otherwise leave the list alone and return 0.
# A pattern match is used rather than int() so that an explicit bias of 0
# is recognised too (int() maps both "0" and "%rax" to false).
sub _take_bias {
my $ops=shift;

    return shift(@$ops) if (@$ops && $ops->[0] =~ /^[-+]?\d+$/);
    return 0;
}

# _saveround(ROUND, BASE, [BIAS,] REG...)
# Appends stores of the given registers to BIAS+ROUND*8(BASE) to $code.
#   four registers : 32-bit values, stored pairwise swapped at +0,+4,+8,+12
#   one or two     : stored at +0 and (if present) +8
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=_take_bias(\@T);

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround(ROUND, BASE, [BIAS,] REG [, REG])
# Appends loads from BIAS+ROUND*8(BASE) (and +8 for a second register)
# into the given registers to $code.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=_take_bias(\@T);

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
# __rotl128(HI, LO, COUNT)
# shld-based 128-bit left rotate of the HI:LO register pair; clobbers %r11.
# Emits nothing when COUNT is 0. Not called anywhere in the visible code;
# _rotl128 below is used instead.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ... Implementing 128-bit rotate without shld gives 80% better
# performance EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128(HI, LO, COUNT)
# Shift/or based 128-bit left rotate of the HI:LO register pair by COUNT
# bits; clobbers %r9 and %r11. Emits nothing when COUNT is 0.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Key schedule generator. $step counts the Feistel rounds emitted so far,
# which also selects the SIGMA constant each round prefetches.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit key: subkeys are rotations of KL (%rax:%rbx) and KA (%r8:%r10).
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 192/256-bit key: subkeys are rotations of KL, KR, KA and KB (%r8:%r10).
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}

# Base 8-bit substitution table; the four 32-bit lookup tables emitted
# below are derived from it by S1110/S4404/S0222/S3033.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each accessor takes a table index 0..255 and returns the 32-bit table
# entry as a "0x%08x" string. The digits in the name show which output
# byte lanes are populated (0 = zero byte).
# S1110: SBOX[i] in the three upper bytes.
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
# S4404: index rotated left by one bit before the lookup; bytes 3,2,0.
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
# S0222: looked-up value rotated left by one bit; bytes 2,1,0.
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
# S3033: looked-up value rotated right by one bit; bytes 3,1,0.
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }

$code.=<<___;
.section .rodata align=64
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(LIST): append one ".long" directive holding the given values.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# Stack frame slots used by the CBC routine.
$_key="0(%rsp)";
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.text
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
.cfi_startproc
	endbranch
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp
.cfi_cfa_expression	$_rsp,deref,+56

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
.cfi_def_cfa	%rcx,56
	mov	0(%rcx),%r15
.cfi_restore	%r15
	mov	8(%rcx),%r14
.cfi_restore	%r14
	mov	16(%rcx),%r13
.cfi_restore	%r13
	mov	24(%rcx),%r12
.cfi_restore	%r12
	mov	32(%rcx),%rbp
.cfi_restore	%rbp
	mov	40(%rcx),%rbx
.cfi_restore	%rbx
	lea	48(%rcx),%rsp
.cfi_def_cfa	%rsp,8
.Lcbc_abort:
	ret
.cfi_endproc
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}

# Evaluate every back-tick quoted Perl expression embedded in the generated
# text, then hand the result to the translator through STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";