1#! /usr/bin/env perl 2# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# ==================================================================== 10# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke 11# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for 12# the OpenSSL project. 13# ==================================================================== 14 15# 16# Fixed length (n=6), unrolled PPC Montgomery Multiplication 17# 18 19# 2021 20# 21# Although this is a generic implementation for unrolling Montgomery 22# Multiplication for arbitrary values of n, this is currently only 23# used for n = 6 to improve the performance of ECC p384. 24# 25# Unrolling allows intermediate results to be stored in registers, 26# rather than on the stack, improving performance by ~7% compared to 27# the existing PPC assembly code. 28# 29# The ISA 3.0 implementation uses combination multiply/add 30# instructions (maddld, maddhdu) to improve performance by an 31# additional ~10% on Power 9. 32# 33# Finally, saving non-volatile registers into volatile vector 34# registers instead of onto the stack saves a little more. 35# 36# On a Power 9 machine we see an overall improvement of ~18%. 37# 38 39use strict; 40use warnings; 41 42my ($flavour, $output, $dir, $xlate); 43 44# $output is the last argument if it looks like a file (it has an extension) 45# $flavour is the first argument if it doesn't look like a file 46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 52die "can't locate ppc-xlate.pl"; 53 54open STDOUT,"| $^X $xlate $flavour \"$output\"" 55 or die "can't call $xlate: $!"; 56 57if ($flavour !~ /64/) { 58 die "bad flavour ($flavour) - only ppc64 permitted"; 59} 60 61my $SIZE_T= 8; 62 63# Registers are global so the code is remotely readable 64 65# Parameters for Montgomery multiplication 66my $ze = "r0"; 67my $sp = "r1"; 68my $toc = "r2"; 69my $rp = "r3"; 70my $ap = "r4"; 71my $bp = "r5"; 72my $np = "r6"; 73my $n0 = "r7"; 74my $num = "r8"; 75 76my $i = "r9"; 77my $c0 = "r10"; 78my $bp0 = "r11"; 79my $bpi = "r11"; 80my $bpj = "r11"; 81my $tj = "r12"; 82my $apj = "r12"; 83my $npj = "r12"; 84my $lo = "r14"; 85my $c1 = "r14"; 86 87# Non-volatile registers used for tp[i] 88# 89# 12 registers are available but the limit on unrolling is 10, 90# since registers from $tp[0] to $tp[$n+1] are used. 91my @tp = ("r20" .. "r31"); 92 93# volatile VSRs for saving non-volatile GPRs - faster than stack 94my @vsrs = ("v32" .. "v46"); 95 96package Mont; 97 98sub new($$) 99{ 100 my ($class, $n) = @_; 101 102 if ($n > 10) { 103 die "Can't unroll for BN length ${n} (maximum 10)" 104 } 105 106 my $self = { 107 code => "", 108 n => $n, 109 }; 110 bless $self, $class; 111 112 return $self; 113} 114 115sub add_code($$) 116{ 117 my ($self, $c) = @_; 118 119 $self->{code} .= $c; 120} 121 122sub get_code($) 123{ 124 my ($self) = @_; 125 126 return $self->{code}; 127} 128 129sub get_function_name($) 130{ 131 my ($self) = @_; 132 133 return "bn_mul_mont_fixed_n" . $self->{n}; 134} 135 136sub get_label($$) 137{ 138 my ($self, $l) = @_; 139 140 return "L" . $l . "_" . $self->{n}; 141} 142 143sub get_labels($@) 144{ 145 my ($self, @labels) = @_; 146 147 my %out = (); 148 149 foreach my $l (@labels) { 150 $out{"$l"} = $self->get_label("$l"); 151 } 152 153 return \%out; 154} 155 156sub nl($) 157{ 158 my ($self) = @_; 159 160 $self->add_code("\n"); 161} 162 163sub copy_result($) 164{ 165 my ($self) = @_; 166 167 my ($n) = $self->{n}; 168 169 for (my $j = 0; $j < $n; $j++) { 170 $self->add_code(<<___); 171 std $tp[$j],`$j*$SIZE_T`($rp) 172___ 173 } 174 175} 176 177sub mul_mont_fixed($) 178{ 179 my ($self) = @_; 180 181 my ($n) = $self->{n}; 182 my $fname = $self->get_function_name(); 183 my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); 184 185 $self->add_code(<<___); 186 187.globl .${fname} 188.align 5 189.${fname}: 190 191___ 192 193 $self->save_registers(); 194 195 $self->add_code(<<___); 196 li $ze,0 197 ld $n0,0($n0) 198 199 ld $bp0,0($bp) 200 201 ld $apj,0($ap) 202___ 203 204 $self->mul_c_0($tp[0], $apj, $bp0, $c0); 205 206 for (my $j = 1; $j < $n - 1; $j++) { 207 $self->add_code(<<___); 208 ld $apj,`$j*$SIZE_T`($ap) 209___ 210 $self->mul($tp[$j], $apj, $bp0, $c0); 211 } 212 213 $self->add_code(<<___); 214 ld $apj,`($n-1)*$SIZE_T`($ap) 215___ 216 217 $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); 218 219 $self->add_code(<<___); 220 li $tp[$n+1],0 221 222___ 223 224 $self->add_code(<<___); 225 li $i,0 226 mtctr $num 227 b $label->{"enter"} 228 229.align 4 230$label->{"outer"}: 231 ldx $bpi,$bp,$i 232 233 ld $apj,0($ap) 234___ 235 236 $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); 237 238 for (my $j = 1; $j < $n; $j++) { 239 $self->add_code(<<___); 240 ld $apj,`$j*$SIZE_T`($ap) 241___ 242 $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); 243 } 244 245 $self->add_code(<<___); 246 addc $tp[$n],$tp[$n],$c0 247 addze $tp[$n+1],$ze 248___ 249 250 $self->add_code(<<___); 251.align 4 252$label->{"enter"}: 253 mulld $bpi,$tp[0],$n0 254 255 ld $npj,0($np) 256___ 257 258 $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); 259 260 for (my $j = 1; $j < $n; $j++) { 261 $self->add_code(<<___); 262 ld $npj,`$j*$SIZE_T`($np) 263___ 264 $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); 265 } 266 267 $self->add_code(<<___); 268 addc $tp[$n-1],$tp[$n],$c0 269 addze $tp[$n],$tp[$n+1] 270 271 addi $i,$i,$SIZE_T 272 bdnz $label->{"outer"} 273 274 and. $tp[$n],$tp[$n],$tp[$n] 275 bne $label->{"sub"} 276 277 cmpld $tp[$n-1],$npj 278 blt $label->{"copy"} 279 280$label->{"sub"}: 281___ 282 283 # 284 # Reduction 285 # 286 287 $self->add_code(<<___); 288 ld $bpj,`0*$SIZE_T`($np) 289 subfc $c1,$bpj,$tp[0] 290 std $c1,`0*$SIZE_T`($rp) 291 292___ 293 for (my $j = 1; $j < $n - 1; $j++) { 294 $self->add_code(<<___); 295 ld $bpj,`$j*$SIZE_T`($np) 296 subfe $c1,$bpj,$tp[$j] 297 std $c1,`$j*$SIZE_T`($rp) 298 299___ 300 } 301 302 $self->add_code(<<___); 303 subfe $c1,$npj,$tp[$n-1] 304 std $c1,`($n-1)*$SIZE_T`($rp) 305 306___ 307 308 $self->add_code(<<___); 309 addme. $tp[$n],$tp[$n] 310 beq $label->{"end"} 311 312$label->{"copy"}: 313___ 314 315 $self->copy_result(); 316 317 $self->add_code(<<___); 318 319$label->{"end"}: 320___ 321 322 $self->restore_registers(); 323 324 $self->add_code(<<___); 325 li r3,1 326 blr 327.size .${fname},.-.${fname} 328___ 329 330} 331 332package Mont::GPR; 333 334our @ISA = ('Mont'); 335 336sub new($$) 337{ 338 my ($class, $n) = @_; 339 340 return $class->SUPER::new($n); 341} 342 343sub save_registers($) 344{ 345 my ($self) = @_; 346 347 my $n = $self->{n}; 348 349 $self->add_code(<<___); 350 std $lo,-8($sp) 351___ 352 353 for (my $j = 0; $j <= $n+1; $j++) { 354 $self->{code}.=<<___; 355 std $tp[$j],-`($j+2)*8`($sp) 356___ 357 } 358 359 $self->add_code(<<___); 360 361___ 362} 363 364sub restore_registers($) 365{ 366 my ($self) = @_; 367 368 my $n = $self->{n}; 369 370 $self->add_code(<<___); 371 ld $lo,-8($sp) 372___ 373 374 for (my $j = 0; $j <= $n+1; $j++) { 375 $self->{code}.=<<___; 376 ld $tp[$j],-`($j+2)*8`($sp) 377___ 378 } 379 380 $self->{code} .=<<___; 381 382___ 383} 384 385# Direct translation of C mul() 386sub mul($$$$$) 387{ 388 my ($self, $r, $a, $w, $c) = @_; 389 390 $self->add_code(<<___); 391 mulld $lo,$a,$w 392 addc $r,$lo,$c 393 mulhdu $c,$a,$w 394 addze $c,$c 395 396___ 397} 398 399# Like mul() but $c is ignored as an input - an optimisation to save a 400# preliminary instruction that would set input $c to 0 401sub mul_c_0($$$$$) 402{ 403 my ($self, $r, $a, $w, $c) = @_; 404 405 $self->add_code(<<___); 406 mulld $r,$a,$w 407 mulhdu $c,$a,$w 408 409___ 410} 411 412# Like mul() but does not to the final addition of CA into $c - an 413# optimisation to save an instruction 414sub mul_last($$$$$$) 415{ 416 my ($self, $r1, $r2, $a, $w, $c) = @_; 417 418 $self->add_code(<<___); 419 mulld $lo,$a,$w 420 addc $r1,$lo,$c 421 mulhdu $c,$a,$w 422 423 addze $r2,$c 424___ 425} 426 427# Like C mul_add() but allow $r_out and $r_in to be different 428sub mul_add($$$$$$) 429{ 430 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 431 432 $self->add_code(<<___); 433 mulld $lo,$a,$w 434 addc $lo,$lo,$c 435 mulhdu $c,$a,$w 436 addze $c,$c 437 addc $r_out,$r_in,$lo 438 addze $c,$c 439 440___ 441} 442 443# Like mul_add() but $c is ignored as an input - an optimisation to save a 444# preliminary instruction that would set input $c to 0 445sub mul_add_c_0($$$$$$) 446{ 447 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 448 449 $self->add_code(<<___); 450 mulld $lo,$a,$w 451 addc $r_out,$r_in,$lo 452 mulhdu $c,$a,$w 453 addze $c,$c 454 455___ 456} 457 458package Mont::GPR_300; 459 460our @ISA = ('Mont::GPR'); 461 462sub new($$) 463{ 464 my ($class, $n) = @_; 465 466 my $mont = $class->SUPER::new($n); 467 468 return $mont; 469} 470 471sub get_function_name($) 472{ 473 my ($self) = @_; 474 475 return "bn_mul_mont_300_fixed_n" . $self->{n}; 476} 477 478sub get_label($$) 479{ 480 my ($self, $l) = @_; 481 482 return "L" . $l . "_300_" . $self->{n}; 483} 484 485# Direct translation of C mul() 486sub mul($$$$$) 487{ 488 my ($self, $r, $a, $w, $c, $last) = @_; 489 490 $self->add_code(<<___); 491 maddld $r,$a,$w,$c 492 maddhdu $c,$a,$w,$c 493 494___ 495} 496 497# Save the last carry as the final entry 498sub mul_last($$$$$) 499{ 500 my ($self, $r1, $r2, $a, $w, $c) = @_; 501 502 $self->add_code(<<___); 503 maddld $r1,$a,$w,$c 504 maddhdu $r2,$a,$w,$c 505 506___ 507} 508 509# Like mul() but $c is ignored as an input - an optimisation to save a 510# preliminary instruction that would set input $c to 0 511sub mul_c_0($$$$$) 512{ 513 my ($self, $r, $a, $w, $c) = @_; 514 515 $self->add_code(<<___); 516 mulld $r,$a,$w 517 mulhdu $c,$a,$w 518 519___ 520} 521 522# Like C mul_add() but allow $r_out and $r_in to be different 523sub mul_add($$$$$$) 524{ 525 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 526 527 $self->add_code(<<___); 528 maddld $lo,$a,$w,$c 529 maddhdu $c,$a,$w,$c 530 addc $r_out,$r_in,$lo 531 addze $c,$c 532 533___ 534} 535 536# Like mul_add() but $c is ignored as an input - an optimisation to save a 537# preliminary instruction that would set input $c to 0 538sub mul_add_c_0($$$$$$) 539{ 540 my ($self, $r_out, $r_in, $a, $w, $c) = @_; 541 542 $self->add_code(<<___); 543 maddld $lo,$a,$w,$r_in 544 maddhdu $c,$a,$w,$r_in 545___ 546 547 if ($r_out ne $lo) { 548 $self->add_code(<<___); 549 mr $r_out,$lo 550___ 551 } 552 553 $self->nl(); 554} 555 556 557package main; 558 559my $code; 560 561$code.=<<___; 562.machine "any" 563.text 564___ 565 566my $mont; 567 568$mont = new Mont::GPR(6); 569$mont->mul_mont_fixed(); 570$code .= $mont->get_code(); 571 572$mont = new Mont::GPR_300(6); 573$mont->mul_mont_fixed(); 574$code .= $mont->get_code(); 575 576$code =~ s/\`([^\`]*)\`/eval $1/gem; 577 578$code.=<<___; 579.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>" 580___ 581 582print $code; 583close STDOUT or die "error closing STDOUT: $!"; 584