! Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
!
! Licensed under the Apache License 2.0 (the "License").  You may not use
! this file except in compliance with the License.  You can obtain a copy
! in the file LICENSE in the source distribution or at
! https://www.openssl.org/source/license.html

#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64  /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
# define ABI64  /* They've said -m64 at command line */
#endif

/*
 * FRAME is the (negative) stack adjustment used by every "save" below,
 * BIAS is the offset added to %sp/%fp to obtain a real address
 * (2047 for the 64-bit ABI, 0 for the 32-bit one).
 */
#ifdef ABI64
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME	-192
# define	BIAS	2047
#else
# define	FRAME	-96
# define	BIAS	0
#endif

.text
.align	32
.global	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,#function
!-----------------------------------------------------------------------
! OPENSSL_wipe_cpu
!
! Zeroes the integer registers of the current window (and, outside of
! Solaris, of the windows walked by .walk.reg.wins) plus the floating
! point register file.
!
! Keep in mind that this does not excuse us from wiping the stack!
! This routine wipes registers, but not the backing store [which
! resides on the stack, toward lower addresses]. To facilitate
! stack wiping the routine returns a pointer to the top of stack of
! the *caller*.
!
! Out:	%o0 (as seen by the caller) = caller %sp + BIAS
!-----------------------------------------------------------------------
OPENSSL_wipe_cpu:
	save	%sp,FRAME,%sp
	nop
#ifdef __sun
#include <sys/trap.h>
	ta	ST_CLEAN_WINDOWS
#else
	call	.walk.reg.wins
#endif
	nop				! delay slot of the call above
	! Position independent address of .zero: the delay slot loads
	! the distance from the call instruction to .zero, and the
	! helper adds %o7 (address of the call) to it.
	call	.PIC.zero.up
	mov	.zero-(.-4),%o0
	ld	[%o0],%f0		! %f0 = 0
	ld	[%o0],%f1		! %f1 = 0

	subcc	%g0,1,%o0
	! Following is V9 "rd %ccr,%o0" instruction. However! V8
	! specification says that it ("rd %asr2,%o0" in V8 terms) does
	! not cause illegal_instruction trap. It therefore can be used
	! to determine if the CPU the code is executing on is V8- or
	! V9-compliant, as V9 returns a distinct value of 0x99,
	! "negative" and "borrow" bits set in both %icc and %xcc.
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.v8
	nop
	! Even though we do not use %fp register bank,
	! we wipe it as memcpy might have used it...
	.word	0xbfa00040	!fmovd	%f0,%f62
	.word	0xbba00040	!...
	.word	0xb7a00040
	.word	0xb3a00040
	.word	0xafa00040
	.word	0xaba00040
	.word	0xa7a00040
	.word	0xa3a00040
	.word	0x9fa00040
	.word	0x9ba00040
	.word	0x97a00040
	.word	0x93a00040
	.word	0x8fa00040
	.word	0x8ba00040
	.word	0x87a00040
	.word	0x83a00040	!fmovd	%f0,%f32
	! Common tail: clear %f2-%f31 from the zeroed %f0/%f1 pair,
	! interleaved with clearing of the integer registers.
.v8:	fmovs	%f1,%f31
	clr	%o0
	fmovs	%f0,%f30
	clr	%o1
	fmovs	%f1,%f29
	clr	%o2
	fmovs	%f0,%f28
	clr	%o3
	fmovs	%f1,%f27
	clr	%o4
	fmovs	%f0,%f26
	clr	%o5
	fmovs	%f1,%f25
	clr	%o7
	fmovs	%f0,%f24
	clr	%l0
	fmovs	%f1,%f23
	clr	%l1
	fmovs	%f0,%f22
	clr	%l2
	fmovs	%f1,%f21
	clr	%l3
	fmovs	%f0,%f20
	clr	%l4
	fmovs	%f1,%f19
	clr	%l5
	fmovs	%f0,%f18
	clr	%l6
	fmovs	%f1,%f17
	clr	%l7
	fmovs	%f0,%f16
	clr	%i0
	fmovs	%f1,%f15
	clr	%i1
	fmovs	%f0,%f14
	clr	%i2
	fmovs	%f1,%f13
	clr	%i3
	fmovs	%f0,%f12
	clr	%i4
	fmovs	%f1,%f11
	clr	%i5
	fmovs	%f0,%f10
	clr	%g1
	fmovs	%f1,%f9
	clr	%g2
	fmovs	%f0,%f8
	clr	%g3
	fmovs	%f1,%f7
	clr	%g4
	fmovs	%f0,%f6
	clr	%g5
	fmovs	%f1,%f5
	fmovs	%f0,%f4
	fmovs	%f1,%f3
	fmovs	%f0,%f2

	add	%fp,BIAS,%i0	! return pointer to top of stack of the caller

	ret
	restore

! Eight zero bytes, source of the value loaded into %f0/%f1 above.
.zero:	.long	0x0,0x0
! Helper: turns the call-relative offset passed in %o0 into an address
! by adding the address of the calling instruction (%o7).
.PIC.zero.up:
	retl
	add	%o0,%o7,%o0
#ifdef DEBUG
.global	walk_reg_wins
.type	walk_reg_wins,#function
walk_reg_wins:
#endif
! Helper: recursively opens register windows, clearing the out and
! local registers of each one on the way back. Recursion stops when
! the return address of the new window equals the one of its caller
! or when %o7 is zero. The value returned in %o0 counts the windows
! visited and is only used for debugging.
.walk.reg.wins:
	save	%sp,FRAME,%sp
	cmp	%i7,%o7
	be	2f
	clr	%o0
	cmp	%o7,0	! compiler never cleans %o7...
	be	1f	! could have been a leaf function...
	clr	%o1
	call	.walk.reg.wins
	nop
1:	clr	%o2
	clr	%o3
	clr	%o4
	clr	%o5
	clr	%o7
	clr	%l0
	clr	%l1
	clr	%l2
	clr	%l3
	clr	%l4
	clr	%l5
	clr	%l6
	clr	%l7
	add	%o0,1,%i0	! used for debugging
2:	ret
	restore
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu

.global	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,#function
.align	32
!-----------------------------------------------------------------------
! OPENSSL_atomic_add
!
! Atomically adds a 32-bit amount to a 32-bit word in memory.
!
! In:	%o0 = address of the word
!	%o1 = amount to add
! Out:	%o0 = new value of the word, sign-extended
!
! In 32-bit builds the CPU is probed first: V9 CPUs use the cas loop,
! older ones fall back to a swap-based spin lock that uses the value
! -1 as the "locked" marker and yields the CPU while waiting.
!-----------------------------------------------------------------------
OPENSSL_atomic_add:
#ifndef ABI64
	subcc	%g0,1,%o2
	.word	0x95408000	!rd	%ccr,%o2, see comment above
	cmp	%o2,0x99
	be	.v9
	nop
	save	%sp,FRAME,%sp
	ba	.enter
	nop
#ifdef __sun
! Note that you do not have to link with libthread to call thr_yield,
! as libc provides a stub, which is overloaded the moment you link
! with *either* libpthread or libthread...
#define	YIELD_CPU	thr_yield
#else
! applies at least to Linux and FreeBSD... Feedback expected...
#define	YIELD_CPU	sched_yield
#endif
.spin:	call	YIELD_CPU
	nop
.enter:	ld	[%i0],%i2
	! Test before test-and-set: compare against the very marker that
	! is swapped in below (-1). The previous constant, -4096, never
	! matched a held lock and made a genuine value of -4096 spin
	! forever.
	cmp	%i2,-1
	be	.spin
	mov	-1,%i2			! delay slot: lock marker
	swap	[%i0],%i2		! grab the word, leave marker behind
	cmp	%i2,-1
	be	.spin			! somebody else holds it, retry
	add	%i2,%i1,%i2		! delay slot: new value
	stbar
	st	%i2,[%i0]		! publish new value, releases the lock
	sra	%i2,%g0,%i0
	ret
	restore
.v9:
#endif
	ld	[%o0],%o2
1:	add	%o1,%o2,%o3
	.word	0xd7e2100a	!cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
	cmp	%o2,%o3
	bne	1b
	mov	%o3,%o2		! cas is always fetching to dest. register
	add	%o1,%o2,%o0	! OpenSSL expects the new value
	retl
	sra	%o0,%g0,%o0	! we return signed int, remember?
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

.global	_sparcv9_rdtick
.align	32
!-----------------------------------------------------------------------
! _sparcv9_rdtick
!
! Out on V9:	%o0 = %tick, %o1 = %tick shifted right by 32 bits
! Out otherwise:	%o0 = 0, %o1 = 0
!
! Uses the same "rd %ccr" probe as above to tell V9 from V8.
!-----------------------------------------------------------------------
_sparcv9_rdtick:
	subcc	%g0,1,%o0
	.word	0x91408000	!rd	%ccr,%o0
	cmp	%o0,0x99
	bne	.notick
	xor	%o0,%o0,%o0		! delay slot: %o0 = 0
	.word	0x91410000	!rd	%tick,%o0
	retl
	.word	0x93323020	!srlx	%o0,32,%o1
.notick:
	retl
	xor	%o1,%o1,%o1		! delay slot: %o1 = 0
.type	_sparcv9_rdtick,#function
.size	_sparcv9_rdtick,.-_sparcv9_rdtick

.global	_sparcv9_vis1_probe
.align	8
!-----------------------------------------------------------------------
! _sparcv9_vis1_probe
!
! Executes a pair of VIS1 instructions (a 16-bit FP load from the stack
! and fxor). Presumably the caller arranges to catch the trap raised
! on CPUs lacking them -- confirm against the caller.
! Clobbers %o1 and %f0.
!-----------------------------------------------------------------------
_sparcv9_vis1_probe:
	add	%sp,BIAS+2,%o1
	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
	retl
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
.type	_sparcv9_vis1_probe,#function
.size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe

! Probe and instrument VIS1 instruction. Output is number of cycles it
! takes to execute rdtick and pair of VIS1 instructions. US-Tx VIS unit
! is slow (documented to be 6 cycles on T2) and the core is in-order
! single-issue, it should be possible to distinguish Tx reliably...
! Observed return values are:
!
!	UltraSPARC IIe		7
!	UltraSPARC III		7
!	UltraSPARC T1		24
!	SPARC T4		65(*)
!
! (*)	result has less to do with VIS instruction latencies, rdtick
!	appears that slow, but it does the trick in the sense that FP
!	and VIS code paths are still slower than integer-only ones.
!
! Numbers for T2 and SPARC64 V-VII are more than welcome.
!
! It would be possible to detect specifically US-T1 by instrumenting
! fmul8ulx16, which is emulated on T1 and as such accounts for quite
! a lot of %tick-s, couple of thousand on Linux...
.global	_sparcv9_vis1_instrument
.align	8
!-----------------------------------------------------------------------
! _sparcv9_vis1_instrument
!
! Takes five %tick readings, each preceded by a pair of fxor
! instructions, and returns the smallest of the four intervals
! between consecutive readings.
!
! Out:	%o0 = minimum interval in %tick units
! Clobbers %o1-%o4, %f0, %f2 and the condition codes.
!-----------------------------------------------------------------------
_sparcv9_vis1_instrument:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x91410000	!rd	%tick,%o0
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x93410000	!rd	%tick,%o1
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x95410000	!rd	%tick,%o2
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x97410000	!rd	%tick,%o3
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	.word	0x99410000	!rd	%tick,%o4

	! calculate intervals
	sub	%o1,%o0,%o0
	sub	%o2,%o1,%o1
	sub	%o3,%o2,%o2
	sub	%o4,%o3,%o3

	! find minimum value; each annulled branch skips the instruction
	! after the move, and the move itself only executes when the
	! branch is taken, i.e. when the current minimum is larger
	cmp	%o0,%o1
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o1,%o0
	cmp	%o0,%o2
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o2,%o0
	cmp	%o0,%o3
	.word	0x38680002	!bgu,a	%xcc,.+8
	mov	%o3,%o0

	retl
	nop
.type	_sparcv9_vis1_instrument,#function
.size	_sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument

.global	_sparcv9_vis2_probe
.align	8
! _sparcv9_vis2_probe: executes a single VIS2 instruction (bshuffle)
! in the delay slot of the return. Clobbers %f0.
_sparcv9_vis2_probe:
	retl
	.word	0x81b00980	!bshuffle	%f0,%f0,%f0
.type	_sparcv9_vis2_probe,#function
.size	_sparcv9_vis2_probe,.-_sparcv9_vis2_probe

.global	_sparcv9_fmadd_probe
.align	8
! _sparcv9_fmadd_probe: zeroes %f0 and %f2 with fxor, then executes a
! fused multiply-add on them in the delay slot of the return.
_sparcv9_fmadd_probe:
	.word	0x81b00d80	!fxor	%f0,%f0,%f0
	.word	0x85b08d82	!fxor	%f2,%f2,%f2
	retl
	.word	0x81b80440	!fmaddd	%f0,%f0,%f2,%f0
.type	_sparcv9_fmadd_probe,#function
.size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe

.global	_sparcv9_rdcfr
.align	8
! _sparcv9_rdcfr: returns the contents of %asr26 in %o0.
_sparcv9_rdcfr:
	retl
	.word	0x91468000	!rd	%asr26,%o0
.type	_sparcv9_rdcfr,#function
.size	_sparcv9_rdcfr,.-_sparcv9_rdcfr

.global	_sparcv9_vis3_probe
.align	8
! _sparcv9_vis3_probe: executes a single VIS3 instruction (xmulx) with
! %g0 as both sources and destination, so no register is modified.
_sparcv9_vis3_probe:
	retl
	.word	0x81b022a0	!xmulx	%g0,%g0,%g0
.type	_sparcv9_vis3_probe,#function
.size	_sparcv9_vis3_probe,.-_sparcv9_vis3_probe

.global	_sparcv9_random
.align	8
! _sparcv9_random: executes the "random" instruction with %o0 as
! destination in the delay slot of the return.
_sparcv9_random:
	retl
	.word	0x91b002a0	!random	%o0
.type	_sparcv9_random,#function
! Size is measured from the start of this very routine (it used to be
! measured from _sparcv9_vis3_probe, which overstated it).
.size	_sparcv9_random,.-_sparcv9_random

.global	_sparcv9_fjaesx_probe
.align	8
! _sparcv9_fjaesx_probe: executes a single faesencx instruction.
! Clobbers %f0.
_sparcv9_fjaesx_probe:
	.word	0x81b09206	!faesencx %f2,%f6,%f0
	retl
	nop
.type	_sparcv9_fjaesx_probe,#function
.size	_sparcv9_fjaesx_probe,.-_sparcv9_fjaesx_probe

.global	OPENSSL_cleanse
.align	32
!-----------------------------------------------------------------------
! OPENSSL_cleanse
!
! Stores zeros over a memory region.
!
! In:	%o0 = start address
!	%o1 = length in bytes
! Clobbers %o0, %o1, %g1 and the condition codes.
!
! Lengths up to 14 are handled byte by byte. Longer regions are first
! aligned byte by byte, then cleared 8 bytes at a time (V9) or 4 bytes
! at a time (pre-V9, 32-bit builds only); the remainder goes through
! the byte loop again.
!-----------------------------------------------------------------------
OPENSSL_cleanse:
	cmp	%o1,14
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0			! delay slot: anything to do at all?
	bne	.Little
	nop
	retl
	nop

	! byte-wise loop, entered with %o1 != 0
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0
	retl
	nop
.align	32
.Lot:
#ifndef ABI64
	subcc	%g0,1,%g1
	! "rd %ccr" yields 0x99 here only on V9 CPUs, V8 ones take the
	! 32-bit store path
	.word	0x83408000	!rd	%ccr,%g1
	cmp	%g1,0x99
	bne	.v8lot
	nop
#endif

	! store single bytes until %o0 is 8-byte aligned
.v9lot:	andcc	%o0,7,%g0
	bz	.v9aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v9lot
	add	%o0,1,%o0
.align	16,0x01000000
	! NB: the branch below is hand-encoded with a fixed displacement
	! of -3 instructions, do not insert anything into this loop
.v9aligned:
	.word	0xc0720000	!stx	%g0,[%o0]
	sub	%o1,8,%o1
	andcc	%o1,-8,%g0
#ifdef ABI64
	.word	0x126ffffd	!bnz	%xcc,.v9aligned
#else
	.word	0x124ffffd	!bnz	%icc,.v9aligned
#endif
	add	%o0,8,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#ifndef ABI64
	! store single bytes until %o0 is 4-byte aligned
.v8lot:	andcc	%o0,3,%g0
	bz	.v8aligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.v8lot
	add	%o0,1,%o0
	nop
.v8aligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0
	bnz	.v8aligned
	add	%o0,4,%o0

	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
#endif
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

.global	CRYPTO_memcmp
.align	16
!-----------------------------------------------------------------------
! CRYPTO_memcmp
!
! Compares two byte strings without data-dependent branches: every
! byte pair is always read, the differences are accumulated with
! xor/or and reduced to a single bit at the end.
!
! In:	%o0, %o1 = addresses of the two strings
!	%o2      = number of bytes to compare
! Out:	%o0 = 0 if all bytes are equal (or the length is 0), 1 otherwise
! Clobbers %o1-%o4, %g1 and the condition codes.
!-----------------------------------------------------------------------
CRYPTO_memcmp:
	cmp	%o2,0
#ifdef ABI64
	beq,pn	%xcc,.Lno_data
#else
	beq	.Lno_data
#endif
	xor	%g1,%g1,%g1		! delay slot: accumulator = 0
	nop

.Loop_cmp:
	ldub	[%o0],%o3
	add	%o0,1,%o0
	ldub	[%o1],%o4
	add	%o1,1,%o1
	subcc	%o2,1,%o2
	xor	%o3,%o4,%o4
#ifdef ABI64
	bnz	%xcc,.Loop_cmp
#else
	bnz	.Loop_cmp
#endif
	or	%o4,%g1,%g1		! delay slot: accumulate difference

	sub	%g0,%g1,%g1		! negative iff any byte differed
	srl	%g1,31,%g1		! move bit 31 down to bit 0
.Lno_data:
	retl
	mov	%g1,%o0
.type	CRYPTO_memcmp,#function
.size	CRYPTO_memcmp,.-CRYPTO_memcmp

.global	_sparcv9_vis1_instrument_bus
.align	8
!-----------------------------------------------------------------------
! _sparcv9_vis1_instrument_bus
!
! For each of cnt 32-bit output words: measures the %tick delta since
! the previous iteration, block-loads and block-stores the 64-byte
! line containing the word, then adds the delta to the word with cas.
!
! In:	%o0 = out, address of the first 32-bit word
!	%o1 = cnt, number of words to process
! Out:	%o0 = cnt as passed in
! Clobbers %o1, %o3-%o5, %g1, %g4, the FP registers written by the
! block load, and the condition codes.
!-----------------------------------------------------------------------
_sparcv9_vis1_instrument_bus:
	mov	%o1,%o3					! save cnt
	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	! warm-up pass over the first word with diff = 0
	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

.Loop:	.word	0x99410000	!rd	%tick,%o4
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick

	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
	subcc	%o1,1,%o1				! --$cnt
	bnz	.Loop
	add	%o0,4,%o0				! ++$out

	retl
	mov	%o3,%o0
.type	_sparcv9_vis1_instrument_bus,#function
.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus

.global	_sparcv9_vis1_instrument_bus2
.align	8
!-----------------------------------------------------------------------
! _sparcv9_vis1_instrument_bus2
!
! Same probing sequence as _sparcv9_vis1_instrument_bus, but the output
! pointer only advances (and cnt only decreases) when the measured
! %tick delta differs from the previous one, and the whole loop is
! bounded by max iterations.
!
! In:	%o0 = out, address of the first 32-bit word
!	%o1 = cnt, number of words available
!	%o2 = max, upper bound on loop iterations
! Out:	%o0 = cnt minus the number of words left unused
! Clobbers %o1-%o5, %g1, %g4, %g5, the FP registers written by the
! block load, and the condition codes.
!-----------------------------------------------------------------------
_sparcv9_vis1_instrument_bus2:
	mov	%o1,%o3					! save cnt
	sll	%o1,2,%o1				! cnt*=4

	.word	0x99410000	!rd	%tick,%o4	! tick
	mov	%o4,%o5					! lasttick = tick
	set	0,%g4					! diff

	! warm-up pass over the first word with diff = 0
	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	mov	%g4,%g5					! lastdiff=diff
.Loop2:
	andn	%o0,63,%g1
	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
	.word	0x8143e040	!membar	#Sync
	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
	.word	0x8143e040	!membar	#Sync
	ld	[%o0],%o4
	add	%o4,%g4,%g4
	.word	0xc9e2100c	!cas	[%o0],%o4,%g4

	subcc	%o2,1,%o2				! --max
	bz	.Ldone2
	nop

	.word	0x99410000	!rd	%tick,%o4	! tick
	sub	%o4,%o5,%g4				! diff=tick-lasttick
	mov	%o4,%o5					! lasttick=tick
	cmp	%g4,%g5
	mov	%g4,%g5					! lastdiff=diff

	! branch-free step: %g1 = 4 if diff != lastdiff, 0 otherwise
	.word	0x83408000	!rd	%ccr,%g1
	and	%g1,4,%g1				! isolate zero flag
	xor	%g1,4,%g1				! flip zero flag

	subcc	%o1,%g1,%o1				! conditional --$cnt
	bnz	.Loop2
	add	%o0,%g1,%o0				! conditional ++$out

.Ldone2:
	srl	%o1,2,%o1				! bytes left -> words left
	retl
	sub	%o3,%o1,%o0
.type	_sparcv9_vis1_instrument_bus2,#function
.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2

! Hook OPENSSL_cpuid_setup (defined outside this file) into the .init
! section, so that it is called as part of the initialization code.
.section	".init",#alloc,#execinstr
	call	OPENSSL_cpuid_setup
	nop