#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD K8	Core2	PIII	P4
# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
#
# camellia-128-cbc	17.3	21.1	23.9	25.9
#
# 128-bit key setup	196	280	256	240	cycles/key
# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the directory this script lives in so that x86asm.pl can be
# found either next to it or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$OPENSSL=1;

# The last command-line argument names the output file.  Use the
# three-argument form of open and fail loudly, so that an unwritable
# path cannot silently send the generated assembly to the inherited
# stdout instead.
$output = pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register allocation shared by every routine below.
@T=("eax","ebx","ecx","edx");
$idx="esi";
$key="edi";
$Tbl="ebp";

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|de]crypt routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# Byte offsets of the four (pairwise interleaved) tables inside
# Camellia_SBOX; table entries are addressed with a scale of 8.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i[,$seed[,$frame]]) emits one Feistel round.
#
#	$i	round index; its parity selects which pair of @T registers
#		is the round input and which pair is the scratch/output;
#	$seed	byte offset of the first round key relative to $key
#		(default 0); a negative seed makes the round keys be
#		walked downwards (scale -8 instead of 8);
#	$frame	byte offset of the s0-s3 backing store relative to %esp
#		(default 0).
#
# On entry $idx must already hold the first key word of the round; on
# exit it holds the first key word of the next round (prefetched).
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $frame=defined($_[2])?$_[2]:0;
my $j=($i&1)*2;
# all four are lexical; a comma-chained "my $t0=...,$t1=..." would only
# declare $t0 and leak the rest as package globals
my ($t0,$t1,$t2,$t3)=(@T[($j)%4],@T[($j+1)%4],@T[($j+2)%4],@T[($j+3)%4]);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
# Replaces the keyBitLength argument with the grandRounds count in
# place and tail-jumps into Camellia_EncryptBlock_Rounds.
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core.  Expects the block in @T[0-3], the key
# schedule pointer in $key, the table pointer in $Tbl and a
# caller-allocated frame laid out as $__ra/$__s0-3/$__end; returns the
# result in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	 &mov	 (@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	 (@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	 (@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	 &xor	 (@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	 &and	 (@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	 &rotl	 (@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	 &xor	 (@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
# Replaces the keyBitLength argument with the grandRounds count in
# place and tail-jumps into Camellia_DecryptBlock_Rounds.
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decryption core.  Same register and frame conventions as
# _x86_Camellia_encrypt, except that $key starts at the far end of the
# key schedule and is stepped downwards until it equals $__end.
# In the comments below key[n] denotes the n-th 32-bit word at the
# current value of $key.
&function_begin_B("_x86_Camellia_decrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

	&sub	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	&and	($idx,@T[0]);
	 &mov	 (@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	 (@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	 (@T[2],&DWP(4,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
	 &xor	 (@T[2],$__s2);

	&mov	($idx,&DWP(12,$key));
	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[1];
	&or	($idx,@T[1]);
	 &and	 (@T[2],&DWP(0,$key));
	&xor	(@T[0],$idx);
	 &rotl	 (@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
	 &xor	 (@T[3],@T[2]);
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
	&xor	(@T[3],&DWP(12,$key));
	&xor	(@T[0],&DWP(0,$key));
	&xor	(@T[1],&DWP(4,$key));
	&ret	();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# __rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@T): shld-based 128-bit left
# rotate of $i0:$i1:$i2:$i3 by $rot bits, followed by stores of those
# words that match the head of the trailing register list to
# -128+4*(2*$rnd+n)($key).  Clobbers $idx.
sub __rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($idx,$i0);
	&shld	($i0,$i1,$rot);
	&shld	($i1,$i2,$rot);
	&shld	($i2,$i3,$rot);
	&shld	($i3,$idx,$rot);
    }
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
# _rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@T): shl/shr/or based 128-bit left
# rotate of $i0:$i1:$i2:$i3 by $rot bits (a $rot of 0 only stores).
# Each word that matches the head of the trailing register list is
# stored to -128+4*(2*$rnd+n)($key), n counting the stores made so far.
# Clobbers $idx and $Tbl when $rot is non-zero.
sub _rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($Tbl,$i0);
	&shl	($i0,$rot);
	&mov	($idx,$i1);
	&shr	($idx,32-$rot);
	&shl	($i1,$rot);
	&or	($i0,$idx);
	&mov	($idx,$i2);
	&shl	($i2,$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&shr	($idx,32-$rot);
	&or	($i1,$idx);
	&shr	($Tbl,32-$rot);
	&mov	($idx,$i3);
	&shr	($idx,32-$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&shl	($i3,$rot);
	&or	($i2,$idx);
	&or	($i3,$Tbl);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    } else {
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    }
}

# _saveround($rnd,$key[,$bias],@regs): store up to four registers to
# $bias+$rnd*8($key).  A leading numeric (non-zero) element in the
# trailing list is taken as $bias; register names evaluate to 0 under
# int() and therefore leave $bias at 0.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
}

# _loadround($rnd,$key[,$bias],@regs): counterpart of _saveround, loads
# up to four registers from $bias+$rnd*8($key).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
}

# void Camellia_Ekeygen(
#		const int keyBitLength,
#		const Byte *rawKey,
#		KEY_TABLE_TYPE keyTable)
#
# Leaves the grandRounds count (3 or 4) in eax and a pointer to offset
# 272 of keyTable in edx; Camellia_set_key below relies on both.
&function_begin("Camellia_Ekeygen");
{ my $step=0;

	&stack_push(4);				# place for s[0-3]

	&mov	($Tbl,&wparam(0));		# load arguments
	&mov	($idx,&wparam(1));
	&mov	($key,&wparam(2));

	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&mov	(@T[3],&DWP(12,$idx));

	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(0,$key,@T);		# KL<<<0

	&cmp	($Tbl,128);
	&je	(&label("1st128"));

	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
	&mov	(@T[1],&DWP(20,$idx));
	&cmp	($Tbl,192);
	&je	(&label("1st192"));
	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
	&mov	(@T[3],&DWP(28,$idx));
	&jmp	(&label("1st256"));
&set_label("1st192",4);
	# 192-bit key: the missing 64 bits are the complement of bits 128-191
	&mov	(@T[2],@T[0]);
	&mov	(@T[3],@T[1]);
	&not	(@T[2]);
	&not	(@T[3]);
&set_label("1st256",4);
	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(4,$key,@T);		# temporary storage for KR!

	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
	&xor	(@T[1],&DWP(0*8+4,$key));
	&xor	(@T[2],&DWP(1*8+0,$key));
	&xor	(@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(2));
	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
	&xor	(@T[1],&DWP(0*8+4,$idx));
	&xor	(@T[2],&DWP(1*8+0,$idx));
	&xor	(@T[3],&DWP(1*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(0));
	&cmp	($idx,128);
	&jne	(&label("2nd256"));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KA
	&_saveround	(2,$key,-128,@T);	# KA<<<0
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	&_rotl128	(@T,15,4,@T);		# KL<<<15
	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",3);			# 3 grandRounds
	&jmp	(&label("done"));

&set_label("2nd256",16);
	&mov	($idx,&wparam(2));
	&_saveround	(6,$idx,@T);		# temporary storage for KA!

	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
	&xor	(@T[1],&DWP(4*8+4,$idx));
	&xor	(@T[2],&DWP(5*8+0,$idx));
	&xor	(@T[3],&DWP(5*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KB
	&_saveround	(2,$key,-128,@T);	# KB<<<0
	&_rotl128	(@T,30,10,@T);		# KB<<<30
	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)

	####### process KR
	&_loadround	(4,$key,-128,@T);	# load KR
	&_rotl128	(@T,15,4,@T);		# KR<<<15
	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)

	####### process KA
	&_loadround	(6,$key,-128,@T);	# load KA
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",4);			# 4 grandRounds
&set_label("done");
	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
	&stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int Camellia_set_key (
#		const unsigned char *userKey,
#		int bits,
#		CAMELLIA_KEY *key)
#
# Returns -1 if userKey or key is NULL, -2 if bits is not one of
# 128/192/256, and 0 on success.
&function_begin_B("Camellia_set_key");
	&push	("ebx");
	&mov	("ecx",&wparam(0));	# pull arguments
	&mov	("ebx",&wparam(1));
	&mov	("edx",&wparam(2));

	&mov	("eax",-1);
	&test	("ecx","ecx");
	&jz	(&label("done"));	# userKey==NULL?
	&test	("edx","edx");
	&jz	(&label("done"));	# key==NULL?

	&mov	("eax",-2);
	&cmp	("ebx",256);
	&je	(&label("arg_ok"));	# bits==256?
	&cmp	("ebx",192);
	&je	(&label("arg_ok"));	# bits==192?
	&cmp	("ebx",128);
	&jne	(&label("done"));	# bits!=128?
&set_label("arg_ok",4);

	&push	("edx");		# push arguments
	&push	("ecx");
	&push	("ebx");
	&call	("Camellia_Ekeygen");
	&stack_pop(3);

	# eax holds grandRounds and edx points at where to put it
	&mov	(&DWP(0,"edx"),"eax");
	&xor	("eax","eax");
&set_label("done",4);
	&pop	("ebx");
	&ret	();
&function_end_B("Camellia_set_key");
}

# Base 8-bit S-box; the four 32-bit lookup tables are derived from it
# by S1110/S4404/S0222/S3033 below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper maps index $i to one 32-bit table entry: the (possibly
# rotated) S-box byte replicated into three of the four byte lanes.
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }

&set_label("Camellia_SIGMA",64);
&data_word(
    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    0,          0,          0,          0);
&set_label("Camellia_SBOX",64);
# tables are interleaved, remember?
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# stack frame layout
#             -4(%esp)		# return address	 0(%esp)
#              0(%esp)		# s0			 4(%esp)
#              4(%esp)		# s1			 8(%esp)
#              8(%esp)		# s2			12(%esp)
#             12(%esp)		# s3			16(%esp)
#             16(%esp)		# end of key schedule	20(%esp)
#             20(%esp)		# %esp backup
my $_inp=&DWP(24,"esp");	#copy of wparam(0)
my $_out=&DWP(28,"esp");	#copy of wparam(1)
my $_len=&DWP(32,"esp");	#copy of wparam(2)
my $_key=&DWP(36,"esp");	#copy of wparam(3)
my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
my $ivec=&DWP(44,"esp");	#ivec[16]
my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
my ($s0,$s1,$s2,$s3) = @T;

&function_begin("Camellia_cbc_encrypt");
	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
	&cmp	($s2,0);
	&je	(&label("enc_out"));

	&pushf	();
	&cld	();

	&mov	($s0,&wparam(0));	# load inp
	&mov	($s1,&wparam(1));	# load out
	#&mov	($s2,&wparam(2));	# load len
	&mov	($s3,&wparam(3));	# load key
	&mov	($Tbl,&wparam(4));	# load ivp

	# allocate aligned stack frame...
	&lea	($idx,&DWP(-64,"esp"));
	&and	($idx,-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	($key,&DWP(-64-63,$s3));
	&sub	($key,$idx);
	&neg	($key);
	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	($idx,$key);

	&mov	($key,&wparam(5));	# load enc

	&exch	("esp",$idx);
	&add	("esp",4);		# reserve for return address!
	&mov	($_esp,$idx);		# save %esp

	&mov	($_inp,$s0);		# save copy of inp
	&mov	($_out,$s1);		# save copy of out
	&mov	($_len,$s2);		# save copy of len
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$Tbl);		# save copy of ivp

	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	($idx,32);
	&set_label("prefetch_sbox",4);
		&mov	($s0,&DWP(0,$Tbl));
		&mov	($s1,&DWP(32,$Tbl));
		&mov	($s2,&DWP(64,$Tbl));
		&mov	($s3,&DWP(96,$Tbl));
		&lea	($Tbl,&DWP(128,$Tbl));
		&dec	($idx);
	&jnz	(&label("prefetch_sbox"));
	&mov	($s0,$_key);
	&sub	($Tbl,4096);
	&mov	($idx,$_inp);
	&mov	($s3,&DWP(272,$s0));		# load grandRounds

	&cmp	($key,0);
	&je	(&label("DECRYPT"));

	&mov	($s2,$_len);
	&mov	($key,$_ivp);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s3);

	&test	($s2,0xFFFFFFF0);
	&jz	(&label("enc_tail"));		# short input...

	&mov	($s0,&DWP(0,$key));		# load iv
	&mov	($s1,&DWP(4,$key));

	&set_label("enc_loop",4);
		&mov	($s2,&DWP(8,$key));
		&mov	($s3,&DWP(12,$key));

		&xor	($s0,&DWP(0,$idx));	# xor input data
		&xor	($s1,&DWP(4,$idx));
		&xor	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&xor	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_encrypt");

		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&mov	(&DWP(0,$key),$s0);	# save output data
		&bswap	($s3);
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($s2,$_len);		# load len

		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($s3,&DWP(16,$key));
		&mov	($_out,$s3);		# save out

		&sub	($s2,16);
		&test	($s2,0xFFFFFFF0);
		&mov	($_len,$s2);		# save len
	&jnz	(&label("enc_loop"));
	&test	($s2,15);
	&jnz	(&label("enc_tail"));
	&mov	($idx,$_ivp);		# load ivp
	&mov	($s2,&DWP(8,$key));	# restore last dwords
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# save ivec
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);

	&mov	("esp",$_esp);
	&popf	();
    &set_label("enc_out");
	&function_end_A();
	&pushf	();			# kludge, never executed

    &set_label("enc_tail",4);
	&mov	($s0,$key eq "edi" ? $key : "");
	&mov	($key,$_out);			# load out
	&push	($s0);				# push ivp
	&mov	($s1,16);
	&sub	($s1,$s2);
	&cmp	($key,$idx);			# compare with inp
	&je	(&label("enc_in_place"));
	&align	(4);
	&data_word(0xA4F3F689);	# rep movsb	# copy input
	&jmp	(&label("enc_skip_in_place"));
    &set_label("enc_in_place");
	&lea	($key,&DWP(0,$key,$s2));
    &set_label("enc_skip_in_place");
	&mov	($s2,$s1);
	&xor	($s0,$s0);
	&align	(4);
	&data_word(0xAAF3F689);	# rep stosb	# zero tail
	&pop	($key);				# pop ivp

	&mov	($idx,$_out);			# output as input
	&mov	($s0,&DWP(0,$key));
	&mov	($s1,&DWP(4,$key));
	&mov	($_len,16);			# len=16
	&jmp	(&label("enc_loop"));		# one more spin...

#----------------------------- DECRYPT -----------------------------#
&set_label("DECRYPT",16);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s0);
	&mov	($_key,$s3);

	&cmp	($idx,$_out);
	&je	(&label("dec_in_place"));	# in-place processing...

	&mov	($key,$_ivp);			# load ivp
	&mov	($_tmp,$key);

	&set_label("dec_loop",4);
		&mov	($s0,&DWP(0,$idx));	# read input
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&mov	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_decrypt");

		&mov	($key,$_tmp);		# load ivp
		&mov	($idx,$_len);		# load len

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&xor	($s0,&DWP(0,$key));	# xor iv
		&bswap	($s3);
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&sub	($idx,16);
		&jc	(&label("dec_partial"));
		&mov	($_len,$idx);		# save len
		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&mov	(&DWP(0,$key),$s0);	# write output
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($_tmp,$idx);		# save ivp
		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($key,&DWP(16,$key));
		&mov	($_out,$key);		# save out

	&jnz	(&label("dec_loop"));
	&mov	($key,$_tmp);		# load temp ivp
    &set_label("dec_end");
	&mov	($idx,$_ivp);		# load user ivp
	&mov	($s0,&DWP(0,$key));	# load iv
	&mov	($s1,&DWP(4,$key));
	&mov	($s2,&DWP(8,$key));
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# copy back to user
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);
	&jmp	(&label("dec_out"));

    &set_label("dec_partial",4);
	&lea	($key,$ivec);
	&mov	(&DWP(0,$key),$s0);	# dump output to stack
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&mov	(&DWP(12,$key),$s3);
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
	&mov	($idx eq "esi" ? $idx : "",$key);
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
	&data_word(0xA4F3F689);	# rep movsb		# copy output
	&mov	($key,$_inp);				# use inp as temp ivp
	&jmp	(&label("dec_end"));

    &set_label("dec_in_place",4);
	&set_label("dec_in_place_loop");
		&lea	($key,$ivec);
		&mov	($s0,&DWP(0,$idx));	# read input
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&mov	($s3,&DWP(12,$idx));

		&mov	(&DWP(0,$key),$s0);	# copy to temp
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&bswap	($s0);
		&mov	(&DWP(12,$key),$s3);
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_decrypt");

		&mov	($key,$_ivp);		# load ivp
		&mov	($idx,$_out);		# load out

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&xor	($s0,&DWP(0,$key));	# xor iv
		&bswap	($s3);
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&mov	(&DWP(0,$idx),$s0);	# write output
		&mov	(&DWP(4,$idx),$s1);
		&mov	(&DWP(8,$idx),$s2);
		&mov	(&DWP(12,$idx),$s3);

		&lea	($idx,&DWP(16,$idx));
		&mov	($_out,$idx);		# save out

		&lea	($idx,$ivec);
		&mov	($s0,&DWP(0,$idx));	# read temp
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&mov	($s3,&DWP(12,$idx));

		&mov	(&DWP(0,$key),$s0);	# copy iv
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($idx,$_inp);		# load inp

		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&mov	($s2,$_len);		# load len
		&sub	($s2,16);
		&jc	(&label("dec_in_place_partial"));
		&mov	($_len,$s2);		# save len
	&jnz	(&label("dec_in_place_loop"));
	&jmp	(&label("dec_out"));

    &set_label("dec_in_place_partial",4);
	# one can argue if this is actually required...
	&mov	($key eq "edi" ? $key : "",$_out);
	&lea	($idx eq "esi" ? $idx : "",$ivec);
	&lea	($key,&DWP(0,$key,$s2));
	&lea	($idx,&DWP(16,$idx,$s2));
	&neg	($s2 eq "ecx" ? $s2 : "");
	&data_word(0xA4F3F689);	# rep movsb	# restore tail

    &set_label("dec_out",4);
    &mov	("esp",$_esp);
    &popf	();
&function_end("Camellia_cbc_encrypt");
}

&asciz("Camellia for x86 by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";