#! /usr/bin/env perl
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
# <martin@meltin.net> for the OpenSSL project.
# ====================================================================
#
# p521 lower-level primitives for PPC64 using vector instructions.
#
# Usage: <this script> <flavour> [...] <output-file>
#
# The generated assembly is accumulated in $code and finally piped
# through ppc-xlate.pl, which writes it to <output-file> ("-" when no
# output file name was recognised on the command line).
#

use strict;
use warnings;

# First argument is the flavour handed to ppc-xlate.pl; the output file
# is the first following argument that looks like "name.ext".
my $flavour = shift;
my $output = "";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (!$output) {
    $output = "-";
}

# Directory this script was invoked from (with trailing separator).
# Falls back to "" when $0 carries no directory part, so that the
# interpolations below do not touch an undefined value.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

my $xlate;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on goes through the translator.
# A failed pipe open must be fatal, otherwise no output would be produced
# and the failure would only show up (if at all) at close time.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Accumulator for the generated assembly source.
my $code = "";

my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");

# NOTE(review): the "vNN" names used in this file (32..63) look like VSR
# numbers that ppc-xlate.pl maps to vector registers - confirm there.
my $vzero = "v32";

# startproc(NAME)
#
# Append the global symbol declaration, alignment directive and entry
# label for function NAME to $code.
sub startproc($)
{
    my ($name) = @_;

    $code.=<<___;
    .globl ${name}
    .align 5
${name}:

___
}

# endproc(NAME)
#
# Append the function return (blr) and the .size directive for function
# NAME to $code.
sub endproc($)
{
    my ($name) = @_;

    $code.=<<___;
    blr
        .size   ${name},.-${name}

___
}

# push_vrs(MIN, MAX)
#
# Append a prologue that saves registers MIN..MAX (as operands of stxv)
# on the stack.  The incoming stack pointer is copied to $savesp, a
# frame of 16*(count+1) bytes is allocated with stdu, and register
# MAX-k is stored at -16*(k+1) relative to the incoming stack pointer.
# The `...` expression is evaluated by the substitution at the end of
# this script.
sub push_vrs($$)
{
    my ($min, $max) = @_;

    my $count = $max - $min + 1;

    $code.=<<___;
    mr      $savesp,$sp
    stdu        $sp,-16*`$count+1`($sp)

___
    for my $i ($min .. $max) {
        my $mult = $max - $i + 1;
        $code.=<<___;
    stxv        $i,-16*$mult($savesp)
___

    }

    $code.=<<___;

___
}

# pop_vrs(MIN, MAX)
#
# Append the epilogue matching push_vrs(MIN, MAX): reload the value
# stored at 0($sp) into $savesp, restore registers MIN..MAX with lxv
# from the same offsets push_vrs used, then move $savesp back to $sp.
sub pop_vrs($$)
{
    my ($min, $max) = @_;

    $code.=<<___;
    ld      $savesp,0($sp)
___
    for my $i ($min .. $max) {
        my $mult = $max - $i + 1;
        $code.=<<___;
    lxv     $i,-16*$mult($savesp)
___
    }

    $code.=<<___;
    mr      $sp,$savesp

___
}

# load_vrs(POINTER, REG_LIST)
#
# Append nine lxsd loads: element i of the array referenced by REG_LIST
# is loaded from byte offset 8*i off register POINTER (i = 0..8).
sub load_vrs($$)
{
    my ($pointer, $reg_list) = @_;

    for my $i (0 .. 8) {
        my $offset = $i * 8;
        $code.=<<___;
    lxsd        $reg_list->[$i],$offset($pointer)
___
    }

    $code.=<<___;

___
}

# store_vrs(POINTER, REG_LIST)
#
# Append nine stxv stores: element i of the array referenced by REG_LIST
# is stored at byte offset 16*i off register POINTER (i = 0..8).
sub store_vrs($$)
{
    my ($pointer, $reg_list) = @_;

    for my $i (0 .. 8) {
        my $offset = $i * 16;
        $code.=<<___;
    stxv        $reg_list->[$i],$offset($pointer)
___
    }

    $code.=<<___;

___
}

$code.=<<___;
.machine    "any"
.text

___

{
    # mul/square common
    #
    # $t1..$t4 are scratch registers, $zero/$one hold the constants used
    # to build the shift-count vector, @out holds the nine accumulators
    # that are finally written to the output pointer ($outp).
    my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
    my ($zero, $one) = ("r8", "r9");
    my @out = map("v$_",(55..63));

    {
        #
        # p521_felem_mul
        #
        # Inputs are two arrays of nine doublewords ($in1p, $in2p), the
        # output is nine quadwords at $outp.  out[k] accumulates the
        # products in1[i]*in2[j] with i+j == k; products with
        # i+j == k+9 are accumulated into out[k] as well, using in2
        # values that were shifted left by one bit (i.e. doubled).
        # NOTE(review): the doubling presumably implements the
        # wrap-around of the reduction modulo 2^521-1 for the limb size
        # used by the C code - confirm against the caller.
        #

        my ($in1p, $in2p) = ("r4", "r5");
        my @in1 = map("v$_",(45..53));
        my @in2 = map("v$_",(35..43));

        startproc("p521_felem_mul");

        push_vrs(52, 63);

        $code.=<<___;
    vspltisw    $vzero,0

___

        load_vrs($in1p, \@in1);
        load_vrs($in2p, \@in2);

        # Products with i+j <= 8 (no wrap-around), followed by setting
        # up the shift count in $t1.
        $code.=<<___;
    vmsumudm    $out[0],$in1[0],$in2[0],$vzero

    xxpermdi    $t1,$in1[0],$in1[1],0b00
    xxpermdi    $t2,$in2[1],$in2[0],0b00
    vmsumudm    $out[1],$t1,$t2,$vzero

    xxpermdi    $t2,$in2[2],$in2[1],0b00
    vmsumudm    $out[2],$t1,$t2,$vzero
    vmsumudm    $out[2],$in1[2],$in2[0],$out[2]

    xxpermdi    $t2,$in2[3],$in2[2],0b00
    vmsumudm    $out[3],$t1,$t2,$vzero
    xxpermdi    $t3,$in1[2],$in1[3],0b00
    xxpermdi    $t4,$in2[1],$in2[0],0b00
    vmsumudm    $out[3],$t3,$t4,$out[3]

    xxpermdi    $t2,$in2[4],$in2[3],0b00
    vmsumudm    $out[4],$t1,$t2,$vzero
    xxpermdi    $t4,$in2[2],$in2[1],0b00
    vmsumudm    $out[4],$t3,$t4,$out[4]
    vmsumudm    $out[4],$in1[4],$in2[0],$out[4]

    xxpermdi    $t2,$in2[5],$in2[4],0b00
    vmsumudm    $out[5],$t1,$t2,$vzero
    xxpermdi    $t4,$in2[3],$in2[2],0b00
    vmsumudm    $out[5],$t3,$t4,$out[5]

    xxpermdi    $t2,$in2[6],$in2[5],0b00
    vmsumudm    $out[6],$t1,$t2,$vzero
    xxpermdi    $t4,$in2[4],$in2[3],0b00
    vmsumudm    $out[6],$t3,$t4,$out[6]

    xxpermdi    $t2,$in2[7],$in2[6],0b00
    vmsumudm    $out[7],$t1,$t2,$vzero
    xxpermdi    $t4,$in2[5],$in2[4],0b00
    vmsumudm    $out[7],$t3,$t4,$out[7]

    xxpermdi    $t2,$in2[8],$in2[7],0b00
    vmsumudm    $out[8],$t1,$t2,$vzero
    xxpermdi    $t4,$in2[6],$in2[5],0b00
    vmsumudm    $out[8],$t3,$t4,$out[8]

    xxpermdi    $t1,$in1[4],$in1[5],0b00
    xxpermdi    $t2,$in2[1],$in2[0],0b00
    vmsumudm    $out[5],$t1,$t2,$out[5]

    xxpermdi    $t2,$in2[2],$in2[1],0b00
    vmsumudm    $out[6],$t1,$t2,$out[6]
    vmsumudm    $out[6],$in1[6],$in2[0],$out[6]

    xxpermdi    $t2,$in2[3],$in2[2],0b00
    vmsumudm    $out[7],$t1,$t2,$out[7]
    xxpermdi    $t3,$in1[6],$in1[7],0b00
    xxpermdi    $t4,$in2[1],$in2[0],0b00
    vmsumudm    $out[7],$t3,$t4,$out[7]

    xxpermdi    $t2,$in2[4],$in2[3],0b00
    vmsumudm    $out[8],$t1,$t2,$out[8]
    xxpermdi    $t4,$in2[2],$in2[1],0b00
    vmsumudm    $out[8],$t3,$t4,$out[8]
    vmsumudm    $out[8],$in1[8],$in2[0],$out[8]

    li      $zero,0
    li      $one,1
    mtvsrdd     $t1,$one,$zero
___

        # Shift every in2 element left by the count held in $t1.
        for my $i (0 .. 8) {
            $code.=<<___;
    vsld        $in2[$i],$in2[$i],$t1
___
        }

        # Products with i+j >= 9, accumulated into out[i+j-9] using the
        # shifted in2 values.
        $code.=<<___;

    vmsumudm    $out[7],$in1[8],$in2[8],$out[7]

    xxpermdi    $t2,$in2[8],$in2[7],0b00
    xxpermdi    $t1,$in1[7],$in1[8],0b00
    vmsumudm    $out[6],$t1,$t2,$out[6]

    xxpermdi    $t1,$in1[6],$in1[7],0b00
    vmsumudm    $out[5],$t1,$t2,$out[5]
    vmsumudm    $out[5],$in1[8],$in2[6],$out[5]

    xxpermdi    $t1,$in1[5],$in1[6],0b00
    vmsumudm    $out[4],$t1,$t2,$out[4]
    xxpermdi    $t4,$in2[6],$in2[5],0b00
    xxpermdi    $t3,$in1[7],$in1[8],0b00
    vmsumudm    $out[4],$t3,$t4,$out[4]

    xxpermdi    $t1,$in1[4],$in1[5],0b00
    vmsumudm    $out[3],$t1,$t2,$out[3]
    xxpermdi    $t3,$in1[6],$in1[7],0b00
    vmsumudm    $out[3],$t3,$t4,$out[3]
    vmsumudm    $out[3],$in1[8],$in2[4],$out[3]

    xxpermdi    $t1,$in1[3],$in1[4],0b00
    vmsumudm    $out[2],$t1,$t2,$out[2]
    xxpermdi    $t3,$in1[5],$in1[6],0b00
    vmsumudm    $out[2],$t3,$t4,$out[2]

    xxpermdi    $t1,$in1[2],$in1[3],0b00
    vmsumudm    $out[1],$t1,$t2,$out[1]
    xxpermdi    $t3,$in1[4],$in1[5],0b00
    vmsumudm    $out[1],$t3,$t4,$out[1]

    xxpermdi    $t1,$in1[1],$in1[2],0b00
    vmsumudm    $out[0],$t1,$t2,$out[0]
    xxpermdi    $t3,$in1[3],$in1[4],0b00
    vmsumudm    $out[0],$t3,$t4,$out[0]

    xxpermdi    $t2,$in2[4],$in2[3],0b00
    xxpermdi    $t1,$in1[7],$in1[8],0b00
    vmsumudm    $out[2],$t1,$t2,$out[2]

    xxpermdi    $t1,$in1[6],$in1[7],0b00
    vmsumudm    $out[1],$t1,$t2,$out[1]
    vmsumudm    $out[1],$in1[8],$in2[2],$out[1]

    xxpermdi    $t1,$in1[5],$in1[6],0b00
    vmsumudm    $out[0],$t1,$t2,$out[0]
    xxpermdi    $t4,$in2[2],$in2[1],0b00
    xxpermdi    $t3,$in1[7],$in1[8],0b00
    vmsumudm    $out[0],$t3,$t4,$out[0]

___

        store_vrs($outp, \@out);

        pop_vrs(52, 63);

        endproc("p521_felem_mul");
    }

    {
        #
        # p521_felem_square
        #
        # Input is one array of nine doublewords ($inp), the output is
        # nine quadwords at $outp.  @inx2 holds the input shifted left
        # by one bit so that each cross product in[i]*in[j] (i != j) is
        # counted twice with a single multiplication.  For the products
        # with i+j >= 9, inx2[5..8] are shifted once more before use.
        #

        my ($inp) = ("r4");
        my @in = map("v$_",(45..53));
        my @inx2 = map("v$_",(35..43));

        startproc("p521_felem_square");

        push_vrs(52, 63);

        $code.=<<___;
    vspltisw    $vzero,0

___

        load_vrs($inp, \@in);

        $code.=<<___;
    li      $zero,0
    li      $one,1
    mtvsrdd     $t1,$one,$zero
___

        # inx2[i] = in[i] shifted left by the count held in $t1.
        for my $i (0 .. 8) {
            $code.=<<___;
    vsld        $inx2[$i],$in[$i],$t1
___
        }

        # Products with i+j <= 8, the wrapped squares in[5..8]^2, and a
        # reload of the shift count ($t1 is used as scratch in between).
        $code.=<<___;
    vmsumudm    $out[0],$in[0],$in[0],$vzero

    vmsumudm    $out[1],$in[0],$inx2[1],$vzero

    xxpermdi    $t1,$in[0],$in[1],0b00
    xxpermdi    $t2,$inx2[2],$in[1],0b00
    vmsumudm    $out[2],$t1,$t2,$vzero

    xxpermdi    $t2,$inx2[3],$inx2[2],0b00
    vmsumudm    $out[3],$t1,$t2,$vzero

    xxpermdi    $t2,$inx2[4],$inx2[3],0b00
    vmsumudm    $out[4],$t1,$t2,$vzero
    vmsumudm    $out[4],$in[2],$in[2],$out[4]

    xxpermdi    $t2,$inx2[5],$inx2[4],0b00
    vmsumudm    $out[5],$t1,$t2,$vzero
    vmsumudm    $out[5],$in[2],$inx2[3],$out[5]

    xxpermdi    $t2,$inx2[6],$inx2[5],0b00
    vmsumudm    $out[6],$t1,$t2,$vzero
    xxpermdi    $t3,$in[2],$in[3],0b00
    xxpermdi    $t4,$inx2[4],$in[3],0b00
    vmsumudm    $out[6],$t3,$t4,$out[6]

    xxpermdi    $t2,$inx2[7],$inx2[6],0b00
    vmsumudm    $out[7],$t1,$t2,$vzero
    xxpermdi    $t4,$inx2[5],$inx2[4],0b00
    vmsumudm    $out[7],$t3,$t4,$out[7]

    xxpermdi    $t2,$inx2[8],$inx2[7],0b00
    vmsumudm    $out[8],$t1,$t2,$vzero
    xxpermdi    $t4,$inx2[6],$inx2[5],0b00
    vmsumudm    $out[8],$t3,$t4,$out[8]
    vmsumudm    $out[8],$in[4],$in[4],$out[8]

    vmsumudm    $out[1],$in[5],$inx2[5],$out[1]

    vmsumudm    $out[3],$in[6],$inx2[6],$out[3]

    vmsumudm    $out[5],$in[7],$inx2[7],$out[5]

    vmsumudm    $out[7],$in[8],$inx2[8],$out[7]

    mtvsrdd     $t1,$one,$zero
___

        # Shift inx2[5..8] a second time for the wrapped cross products.
        for my $i (5 .. 8) {
            $code.=<<___;
    vsld        $inx2[$i],$inx2[$i],$t1
___
        }

        # Cross products with i+j >= 9, accumulated into out[i+j-9].
        $code.=<<___;

    vmsumudm    $out[6],$in[7],$inx2[8],$out[6]

    vmsumudm    $out[5],$in[6],$inx2[8],$out[5]

    xxpermdi    $t2,$inx2[8],$inx2[7],0b00
    xxpermdi    $t1,$in[5],$in[6],0b00
    vmsumudm    $out[4],$t1,$t2,$out[4]

    xxpermdi    $t1,$in[4],$in[5],0b00
    vmsumudm    $out[3],$t1,$t2,$out[3]

    xxpermdi    $t1,$in[3],$in[4],0b00
    vmsumudm    $out[2],$t1,$t2,$out[2]
    vmsumudm    $out[2],$in[5],$inx2[6],$out[2]

    xxpermdi    $t1,$in[2],$in[3],0b00
    vmsumudm    $out[1],$t1,$t2,$out[1]
    vmsumudm    $out[1],$in[4],$inx2[6],$out[1]

    xxpermdi    $t1,$in[1],$in[2],0b00
    vmsumudm    $out[0],$t1,$t2,$out[0]
    xxpermdi    $t2,$inx2[6],$inx2[5],0b00
    xxpermdi    $t1,$in[3],$in[4],0b00
    vmsumudm    $out[0],$t1,$t2,$out[0]

___

        store_vrs($outp, \@out);

        pop_vrs(52, 63);

        endproc("p521_felem_square");
    }
}

# Evaluate the `...` arithmetic expressions embedded in the generated
# text, then hand the result to the translator via STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";