1#! /usr/bin/env perl 2# Copyright 2010-2024 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. 13# 14# Rights for redistribution and usage in source and binary forms are 15# granted according to the License. Warranty of any kind is disclaimed. 16# ==================================================================== 17 18 19# July 1999 20# 21# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. 22# 23# The module is designed to work with either of the "new" MIPS ABI(5), 24# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under 25# IRIX 5.x not only because it doesn't support new ABIs but also 26# because 5.x kernels put R4x00 CPU into 32-bit mode and all those 27# 64-bit instructions (daddu, dmultu, etc.) found below gonna only 28# cause illegal instruction exception:-( 29# 30# In addition the code depends on preprocessor flags set up by MIPSpro 31# compiler driver (either as or cc) and therefore (probably?) can't be 32# compiled by the GNU assembler. GNU C driver manages fine though... 33# I mean as long as -mmips-as is specified or is the default option, 34# because then it simply invokes /usr/bin/as which in turn takes 35# perfect care of the preprocessor definitions. Another neat feature 36# offered by the MIPSpro assembler is an optimization pass. This gave 37# me the opportunity to have the code looking more regular as all those 38# architecture dependent instruction rescheduling details were left to 39# the assembler. Cool, huh? 40# 41# Performance improvement is astonishing! 
# 'apps/openssl speed rsa dsa' goes way over 3 times faster!
#
# <appro@openssl.org>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";

# Select instruction mnemonics and operand sizes once, up front, so the
# assembly templates below can be written generically.  For 64-bit ABIs
# (n64/n32) BN_ULONG is 64 bits wide, so doubleword forms (ld/sd, dmultu,
# daddu, ...) are used and $BNSZ (bytes per BN_ULONG) is 8; for 32-bit
# ABIs the word forms are used and $BNSZ is 4.  $PTR_ADD/$PTR_SUB are the
# pointer-arithmetic forms, $SZREG/$REG_S/$REG_L describe the size and
# store/load mnemonics for a saved register on the stack (used by the
# NUBI prologues/epilogues).
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	# 32-bit builds start the output with a ".set mips2" directive
	# (guarded so it is skipped on MIPSr6, where the pre-r6 mult/div
	# instructions it would enable no longer exist).
	$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
}

# Redirect generated assembly to the requested output file, if any.
$output and open STDOUT,">$output";

# Below is N32/64 register layout used in the original module.
#
# Map symbolic register names to their "$N" assembly spellings so the
# templates can refer to registers by ABI role rather than number.
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
# $ta0..$ta3 are scratch aliases for the upper argument registers
# ($a4..$a7), which are free once the (at most four) real arguments
# have been consumed.
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
# No special adaptation is required for O32.
NUBI on the other hand 107# is treated by saving/restoring ($v1,$t0..$t3). 108 109$gp=$v1 if ($flavour =~ /nubi/i); 110 111$minus4=$v1; 112 113$code.=<<___; 114#include "mips_arch.h" 115 116#if defined(_MIPS_ARCH_MIPS64R6) 117# define ddivu(rs,rt) 118# define mfqt(rd,rs,rt) ddivu rd,rs,rt 119# define mfrm(rd,rs,rt) dmodu rd,rs,rt 120#elif defined(_MIPS_ARCH_MIPS32R6) 121# define divu(rs,rt) 122# define mfqt(rd,rs,rt) divu rd,rs,rt 123# define mfrm(rd,rs,rt) modu rd,rs,rt 124#else 125# define $DIVU(rs,rt) $DIVU $zero,rs,rt 126# define mfqt(rd,rs,rt) mflo rd 127# define mfrm(rd,rs,rt) mfhi rd 128#endif 129 130.rdata 131.asciiz "mips3.s, Version 1.2" 132.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" 133 134.text 135.set noat 136 137.align 5 138.globl bn_mul_add_words 139.ent bn_mul_add_words 140bn_mul_add_words: 141 .set noreorder 142 bgtz $a2,bn_mul_add_words_internal 143 move $v0,$zero 144 jr $ra 145 move $a0,$v0 146.end bn_mul_add_words 147 148.align 5 149.ent bn_mul_add_words_internal 150bn_mul_add_words_internal: 151___ 152$code.=<<___ if ($flavour =~ /nubi/i); 153 .frame $sp,6*$SZREG,$ra 154 .mask 0x8000f008,-$SZREG 155 .set noreorder 156 $PTR_SUB $sp,6*$SZREG 157 $REG_S $ra,5*$SZREG($sp) 158 $REG_S $t3,4*$SZREG($sp) 159 $REG_S $t2,3*$SZREG($sp) 160 $REG_S $t1,2*$SZREG($sp) 161 $REG_S $t0,1*$SZREG($sp) 162 $REG_S $gp,0*$SZREG($sp) 163___ 164$code.=<<___; 165 .set reorder 166 li $minus4,-4 167 and $ta0,$a2,$minus4 168 beqz $ta0,.L_bn_mul_add_words_tail 169 170.L_bn_mul_add_words_loop: 171 $LD $t0,0($a1) 172 $MULTU ($t0,$a3) 173 $LD $t1,0($a0) 174 $LD $t2,$BNSZ($a1) 175 $LD $t3,$BNSZ($a0) 176 $LD $ta0,2*$BNSZ($a1) 177 $LD $ta1,2*$BNSZ($a0) 178 $ADDU $t1,$v0 179 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit 180 # values", but it seems to work fine 181 # even on 64-bit registers. 
182 mflo ($at,$t0,$a3) 183 mfhi ($t0,$t0,$a3) 184 $ADDU $t1,$at 185 $ADDU $v0,$t0 186 $MULTU ($t2,$a3) 187 sltu $at,$t1,$at 188 $ST $t1,0($a0) 189 $ADDU $v0,$at 190 191 $LD $ta2,3*$BNSZ($a1) 192 $LD $ta3,3*$BNSZ($a0) 193 $ADDU $t3,$v0 194 sltu $v0,$t3,$v0 195 mflo ($at,$t2,$a3) 196 mfhi ($t2,$t2,$a3) 197 $ADDU $t3,$at 198 $ADDU $v0,$t2 199 $MULTU ($ta0,$a3) 200 sltu $at,$t3,$at 201 $ST $t3,$BNSZ($a0) 202 $ADDU $v0,$at 203 204 subu $a2,4 205 $PTR_ADD $a0,4*$BNSZ 206 $PTR_ADD $a1,4*$BNSZ 207 $ADDU $ta1,$v0 208 sltu $v0,$ta1,$v0 209 mflo ($at,$ta0,$a3) 210 mfhi ($ta0,$ta0,$a3) 211 $ADDU $ta1,$at 212 $ADDU $v0,$ta0 213 $MULTU ($ta2,$a3) 214 sltu $at,$ta1,$at 215 $ST $ta1,-2*$BNSZ($a0) 216 $ADDU $v0,$at 217 218 219 and $ta0,$a2,$minus4 220 $ADDU $ta3,$v0 221 sltu $v0,$ta3,$v0 222 mflo ($at,$ta2,$a3) 223 mfhi ($ta2,$ta2,$a3) 224 $ADDU $ta3,$at 225 $ADDU $v0,$ta2 226 sltu $at,$ta3,$at 227 $ST $ta3,-$BNSZ($a0) 228 .set noreorder 229 bgtz $ta0,.L_bn_mul_add_words_loop 230 $ADDU $v0,$at 231 232 beqz $a2,.L_bn_mul_add_words_return 233 nop 234 235.L_bn_mul_add_words_tail: 236 .set reorder 237 $LD $t0,0($a1) 238 $MULTU ($t0,$a3) 239 $LD $t1,0($a0) 240 subu $a2,1 241 $ADDU $t1,$v0 242 sltu $v0,$t1,$v0 243 mflo ($at,$t0,$a3) 244 mfhi ($t0,$t0,$a3) 245 $ADDU $t1,$at 246 $ADDU $v0,$t0 247 sltu $at,$t1,$at 248 $ST $t1,0($a0) 249 $ADDU $v0,$at 250 beqz $a2,.L_bn_mul_add_words_return 251 252 $LD $t0,$BNSZ($a1) 253 $MULTU ($t0,$a3) 254 $LD $t1,$BNSZ($a0) 255 subu $a2,1 256 $ADDU $t1,$v0 257 sltu $v0,$t1,$v0 258 mflo ($at,$t0,$a3) 259 mfhi ($t0,$t0,$a3) 260 $ADDU $t1,$at 261 $ADDU $v0,$t0 262 sltu $at,$t1,$at 263 $ST $t1,$BNSZ($a0) 264 $ADDU $v0,$at 265 beqz $a2,.L_bn_mul_add_words_return 266 267 $LD $t0,2*$BNSZ($a1) 268 $MULTU ($t0,$a3) 269 $LD $t1,2*$BNSZ($a0) 270 $ADDU $t1,$v0 271 sltu $v0,$t1,$v0 272 mflo ($at,$t0,$a3) 273 mfhi ($t0,$t0,$a3) 274 $ADDU $t1,$at 275 $ADDU $v0,$t0 276 sltu $at,$t1,$at 277 $ST $t1,2*$BNSZ($a0) 278 $ADDU $v0,$at 279 280.L_bn_mul_add_words_return: 281 .set 
noreorder 282___ 283$code.=<<___ if ($flavour =~ /nubi/i); 284 $REG_L $t3,4*$SZREG($sp) 285 $REG_L $t2,3*$SZREG($sp) 286 $REG_L $t1,2*$SZREG($sp) 287 $REG_L $t0,1*$SZREG($sp) 288 $REG_L $gp,0*$SZREG($sp) 289 $PTR_ADD $sp,6*$SZREG 290___ 291$code.=<<___; 292 jr $ra 293 move $a0,$v0 294.end bn_mul_add_words_internal 295 296.align 5 297.globl bn_mul_words 298.ent bn_mul_words 299bn_mul_words: 300 .set noreorder 301 bgtz $a2,bn_mul_words_internal 302 move $v0,$zero 303 jr $ra 304 move $a0,$v0 305.end bn_mul_words 306 307.align 5 308.ent bn_mul_words_internal 309bn_mul_words_internal: 310___ 311$code.=<<___ if ($flavour =~ /nubi/i); 312 .frame $sp,6*$SZREG,$ra 313 .mask 0x8000f008,-$SZREG 314 .set noreorder 315 $PTR_SUB $sp,6*$SZREG 316 $REG_S $ra,5*$SZREG($sp) 317 $REG_S $t3,4*$SZREG($sp) 318 $REG_S $t2,3*$SZREG($sp) 319 $REG_S $t1,2*$SZREG($sp) 320 $REG_S $t0,1*$SZREG($sp) 321 $REG_S $gp,0*$SZREG($sp) 322___ 323$code.=<<___; 324 .set reorder 325 li $minus4,-4 326 and $ta0,$a2,$minus4 327 beqz $ta0,.L_bn_mul_words_tail 328 329.L_bn_mul_words_loop: 330 $LD $t0,0($a1) 331 $MULTU ($t0,$a3) 332 $LD $t2,$BNSZ($a1) 333 $LD $ta0,2*$BNSZ($a1) 334 $LD $ta2,3*$BNSZ($a1) 335 mflo ($at,$t0,$a3) 336 mfhi ($t0,$t0,$a3) 337 $ADDU $v0,$at 338 sltu $t1,$v0,$at 339 $MULTU ($t2,$a3) 340 $ST $v0,0($a0) 341 $ADDU $v0,$t1,$t0 342 343 subu $a2,4 344 $PTR_ADD $a0,4*$BNSZ 345 $PTR_ADD $a1,4*$BNSZ 346 mflo ($at,$t2,$a3) 347 mfhi ($t2,$t2,$a3) 348 $ADDU $v0,$at 349 sltu $t3,$v0,$at 350 $MULTU ($ta0,$a3) 351 $ST $v0,-3*$BNSZ($a0) 352 $ADDU $v0,$t3,$t2 353 354 mflo ($at,$ta0,$a3) 355 mfhi ($ta0,$ta0,$a3) 356 $ADDU $v0,$at 357 sltu $ta1,$v0,$at 358 $MULTU ($ta2,$a3) 359 $ST $v0,-2*$BNSZ($a0) 360 $ADDU $v0,$ta1,$ta0 361 362 and $ta0,$a2,$minus4 363 mflo ($at,$ta2,$a3) 364 mfhi ($ta2,$ta2,$a3) 365 $ADDU $v0,$at 366 sltu $ta3,$v0,$at 367 $ST $v0,-$BNSZ($a0) 368 .set noreorder 369 bgtz $ta0,.L_bn_mul_words_loop 370 $ADDU $v0,$ta3,$ta2 371 372 beqz $a2,.L_bn_mul_words_return 373 nop 374 
375.L_bn_mul_words_tail: 376 .set reorder 377 $LD $t0,0($a1) 378 $MULTU ($t0,$a3) 379 subu $a2,1 380 mflo ($at,$t0,$a3) 381 mfhi ($t0,$t0,$a3) 382 $ADDU $v0,$at 383 sltu $t1,$v0,$at 384 $ST $v0,0($a0) 385 $ADDU $v0,$t1,$t0 386 beqz $a2,.L_bn_mul_words_return 387 388 $LD $t0,$BNSZ($a1) 389 $MULTU ($t0,$a3) 390 subu $a2,1 391 mflo ($at,$t0,$a3) 392 mfhi ($t0,$t0,$a3) 393 $ADDU $v0,$at 394 sltu $t1,$v0,$at 395 $ST $v0,$BNSZ($a0) 396 $ADDU $v0,$t1,$t0 397 beqz $a2,.L_bn_mul_words_return 398 399 $LD $t0,2*$BNSZ($a1) 400 $MULTU ($t0,$a3) 401 mflo ($at,$t0,$a3) 402 mfhi ($t0,$t0,$a3) 403 $ADDU $v0,$at 404 sltu $t1,$v0,$at 405 $ST $v0,2*$BNSZ($a0) 406 $ADDU $v0,$t1,$t0 407 408.L_bn_mul_words_return: 409 .set noreorder 410___ 411$code.=<<___ if ($flavour =~ /nubi/i); 412 $REG_L $t3,4*$SZREG($sp) 413 $REG_L $t2,3*$SZREG($sp) 414 $REG_L $t1,2*$SZREG($sp) 415 $REG_L $t0,1*$SZREG($sp) 416 $REG_L $gp,0*$SZREG($sp) 417 $PTR_ADD $sp,6*$SZREG 418___ 419$code.=<<___; 420 jr $ra 421 move $a0,$v0 422.end bn_mul_words_internal 423 424.align 5 425.globl bn_sqr_words 426.ent bn_sqr_words 427bn_sqr_words: 428 .set noreorder 429 bgtz $a2,bn_sqr_words_internal 430 move $v0,$zero 431 jr $ra 432 move $a0,$v0 433.end bn_sqr_words 434 435.align 5 436.ent bn_sqr_words_internal 437bn_sqr_words_internal: 438___ 439$code.=<<___ if ($flavour =~ /nubi/i); 440 .frame $sp,6*$SZREG,$ra 441 .mask 0x8000f008,-$SZREG 442 .set noreorder 443 $PTR_SUB $sp,6*$SZREG 444 $REG_S $ra,5*$SZREG($sp) 445 $REG_S $t3,4*$SZREG($sp) 446 $REG_S $t2,3*$SZREG($sp) 447 $REG_S $t1,2*$SZREG($sp) 448 $REG_S $t0,1*$SZREG($sp) 449 $REG_S $gp,0*$SZREG($sp) 450___ 451$code.=<<___; 452 .set reorder 453 li $minus4,-4 454 and $ta0,$a2,$minus4 455 beqz $ta0,.L_bn_sqr_words_tail 456 457.L_bn_sqr_words_loop: 458 $LD $t0,0($a1) 459 $MULTU ($t0,$t0) 460 $LD $t2,$BNSZ($a1) 461 $LD $ta0,2*$BNSZ($a1) 462 $LD $ta2,3*$BNSZ($a1) 463 mflo ($t1,$t0,$t0) 464 mfhi ($t0,$t0,$t0) 465 $ST $t1,0($a0) 466 $ST $t0,$BNSZ($a0) 467 468 $MULTU ($t2,$t2) 469 
subu $a2,4 470 $PTR_ADD $a0,8*$BNSZ 471 $PTR_ADD $a1,4*$BNSZ 472 mflo ($t3,$t2,$t2) 473 mfhi ($t2,$t2,$t2) 474 $ST $t3,-6*$BNSZ($a0) 475 $ST $t2,-5*$BNSZ($a0) 476 477 $MULTU ($ta0,$ta0) 478 mflo ($ta1,$ta0,$ta0) 479 mfhi ($ta0,$ta0,$ta0) 480 $ST $ta1,-4*$BNSZ($a0) 481 $ST $ta0,-3*$BNSZ($a0) 482 483 484 $MULTU ($ta2,$ta2) 485 and $ta0,$a2,$minus4 486 mflo ($ta3,$ta2,$ta2) 487 mfhi ($ta2,$ta2,$ta2) 488 $ST $ta3,-2*$BNSZ($a0) 489 490 .set noreorder 491 bgtz $ta0,.L_bn_sqr_words_loop 492 $ST $ta2,-$BNSZ($a0) 493 494 beqz $a2,.L_bn_sqr_words_return 495 nop 496 497.L_bn_sqr_words_tail: 498 .set reorder 499 $LD $t0,0($a1) 500 $MULTU ($t0,$t0) 501 subu $a2,1 502 mflo ($t1,$t0,$t0) 503 mfhi ($t0,$t0,$t0) 504 $ST $t1,0($a0) 505 $ST $t0,$BNSZ($a0) 506 beqz $a2,.L_bn_sqr_words_return 507 508 $LD $t0,$BNSZ($a1) 509 $MULTU ($t0,$t0) 510 subu $a2,1 511 mflo ($t1,$t0,$t0) 512 mfhi ($t0,$t0,$t0) 513 $ST $t1,2*$BNSZ($a0) 514 $ST $t0,3*$BNSZ($a0) 515 beqz $a2,.L_bn_sqr_words_return 516 517 $LD $t0,2*$BNSZ($a1) 518 $MULTU ($t0,$t0) 519 mflo ($t1,$t0,$t0) 520 mfhi ($t0,$t0,$t0) 521 $ST $t1,4*$BNSZ($a0) 522 $ST $t0,5*$BNSZ($a0) 523 524.L_bn_sqr_words_return: 525 .set noreorder 526___ 527$code.=<<___ if ($flavour =~ /nubi/i); 528 $REG_L $t3,4*$SZREG($sp) 529 $REG_L $t2,3*$SZREG($sp) 530 $REG_L $t1,2*$SZREG($sp) 531 $REG_L $t0,1*$SZREG($sp) 532 $REG_L $gp,0*$SZREG($sp) 533 $PTR_ADD $sp,6*$SZREG 534___ 535$code.=<<___; 536 jr $ra 537 move $a0,$v0 538 539.end bn_sqr_words_internal 540 541.align 5 542.globl bn_add_words 543.ent bn_add_words 544bn_add_words: 545 .set noreorder 546 bgtz $a3,bn_add_words_internal 547 move $v0,$zero 548 jr $ra 549 move $a0,$v0 550.end bn_add_words 551 552.align 5 553.ent bn_add_words_internal 554bn_add_words_internal: 555___ 556$code.=<<___ if ($flavour =~ /nubi/i); 557 .frame $sp,6*$SZREG,$ra 558 .mask 0x8000f008,-$SZREG 559 .set noreorder 560 $PTR_SUB $sp,6*$SZREG 561 $REG_S $ra,5*$SZREG($sp) 562 $REG_S $t3,4*$SZREG($sp) 563 $REG_S $t2,3*$SZREG($sp) 564 $REG_S 
$t1,2*$SZREG($sp) 565 $REG_S $t0,1*$SZREG($sp) 566 $REG_S $gp,0*$SZREG($sp) 567___ 568$code.=<<___; 569 .set reorder 570 li $minus4,-4 571 and $at,$a3,$minus4 572 beqz $at,.L_bn_add_words_tail 573 574.L_bn_add_words_loop: 575 $LD $t0,0($a1) 576 $LD $ta0,0($a2) 577 subu $a3,4 578 $LD $t1,$BNSZ($a1) 579 and $at,$a3,$minus4 580 $LD $t2,2*$BNSZ($a1) 581 $PTR_ADD $a2,4*$BNSZ 582 $LD $t3,3*$BNSZ($a1) 583 $PTR_ADD $a0,4*$BNSZ 584 $LD $ta1,-3*$BNSZ($a2) 585 $PTR_ADD $a1,4*$BNSZ 586 $LD $ta2,-2*$BNSZ($a2) 587 $LD $ta3,-$BNSZ($a2) 588 $ADDU $ta0,$t0 589 sltu $t8,$ta0,$t0 590 $ADDU $t0,$ta0,$v0 591 sltu $v0,$t0,$ta0 592 $ST $t0,-4*$BNSZ($a0) 593 $ADDU $v0,$t8 594 595 $ADDU $ta1,$t1 596 sltu $t9,$ta1,$t1 597 $ADDU $t1,$ta1,$v0 598 sltu $v0,$t1,$ta1 599 $ST $t1,-3*$BNSZ($a0) 600 $ADDU $v0,$t9 601 602 $ADDU $ta2,$t2 603 sltu $t8,$ta2,$t2 604 $ADDU $t2,$ta2,$v0 605 sltu $v0,$t2,$ta2 606 $ST $t2,-2*$BNSZ($a0) 607 $ADDU $v0,$t8 608 609 $ADDU $ta3,$t3 610 sltu $t9,$ta3,$t3 611 $ADDU $t3,$ta3,$v0 612 sltu $v0,$t3,$ta3 613 $ST $t3,-$BNSZ($a0) 614 615 .set noreorder 616 bgtz $at,.L_bn_add_words_loop 617 $ADDU $v0,$t9 618 619 beqz $a3,.L_bn_add_words_return 620 nop 621 622.L_bn_add_words_tail: 623 .set reorder 624 $LD $t0,0($a1) 625 $LD $ta0,0($a2) 626 $ADDU $ta0,$t0 627 subu $a3,1 628 sltu $t8,$ta0,$t0 629 $ADDU $t0,$ta0,$v0 630 sltu $v0,$t0,$ta0 631 $ST $t0,0($a0) 632 $ADDU $v0,$t8 633 beqz $a3,.L_bn_add_words_return 634 635 $LD $t1,$BNSZ($a1) 636 $LD $ta1,$BNSZ($a2) 637 $ADDU $ta1,$t1 638 subu $a3,1 639 sltu $t9,$ta1,$t1 640 $ADDU $t1,$ta1,$v0 641 sltu $v0,$t1,$ta1 642 $ST $t1,$BNSZ($a0) 643 $ADDU $v0,$t9 644 beqz $a3,.L_bn_add_words_return 645 646 $LD $t2,2*$BNSZ($a1) 647 $LD $ta2,2*$BNSZ($a2) 648 $ADDU $ta2,$t2 649 sltu $t8,$ta2,$t2 650 $ADDU $t2,$ta2,$v0 651 sltu $v0,$t2,$ta2 652 $ST $t2,2*$BNSZ($a0) 653 $ADDU $v0,$t8 654 655.L_bn_add_words_return: 656 .set noreorder 657___ 658$code.=<<___ if ($flavour =~ /nubi/i); 659 $REG_L $t3,4*$SZREG($sp) 660 $REG_L $t2,3*$SZREG($sp) 661 
$REG_L $t1,2*$SZREG($sp) 662 $REG_L $t0,1*$SZREG($sp) 663 $REG_L $gp,0*$SZREG($sp) 664 $PTR_ADD $sp,6*$SZREG 665___ 666$code.=<<___; 667 jr $ra 668 move $a0,$v0 669 670.end bn_add_words_internal 671 672.align 5 673.globl bn_sub_words 674.ent bn_sub_words 675bn_sub_words: 676 .set noreorder 677 bgtz $a3,bn_sub_words_internal 678 move $v0,$zero 679 jr $ra 680 move $a0,$zero 681.end bn_sub_words 682 683.align 5 684.ent bn_sub_words_internal 685bn_sub_words_internal: 686___ 687$code.=<<___ if ($flavour =~ /nubi/i); 688 .frame $sp,6*$SZREG,$ra 689 .mask 0x8000f008,-$SZREG 690 .set noreorder 691 $PTR_SUB $sp,6*$SZREG 692 $REG_S $ra,5*$SZREG($sp) 693 $REG_S $t3,4*$SZREG($sp) 694 $REG_S $t2,3*$SZREG($sp) 695 $REG_S $t1,2*$SZREG($sp) 696 $REG_S $t0,1*$SZREG($sp) 697 $REG_S $gp,0*$SZREG($sp) 698___ 699$code.=<<___; 700 .set reorder 701 li $minus4,-4 702 and $at,$a3,$minus4 703 beqz $at,.L_bn_sub_words_tail 704 705.L_bn_sub_words_loop: 706 $LD $t0,0($a1) 707 $LD $ta0,0($a2) 708 subu $a3,4 709 $LD $t1,$BNSZ($a1) 710 and $at,$a3,$minus4 711 $LD $t2,2*$BNSZ($a1) 712 $PTR_ADD $a2,4*$BNSZ 713 $LD $t3,3*$BNSZ($a1) 714 $PTR_ADD $a0,4*$BNSZ 715 $LD $ta1,-3*$BNSZ($a2) 716 $PTR_ADD $a1,4*$BNSZ 717 $LD $ta2,-2*$BNSZ($a2) 718 $LD $ta3,-$BNSZ($a2) 719 sltu $t8,$t0,$ta0 720 $SUBU $ta0,$t0,$ta0 721 $SUBU $t0,$ta0,$v0 722 sgtu $v0,$t0,$ta0 723 $ST $t0,-4*$BNSZ($a0) 724 $ADDU $v0,$t8 725 726 sltu $t9,$t1,$ta1 727 $SUBU $ta1,$t1,$ta1 728 $SUBU $t1,$ta1,$v0 729 sgtu $v0,$t1,$ta1 730 $ST $t1,-3*$BNSZ($a0) 731 $ADDU $v0,$t9 732 733 734 sltu $t8,$t2,$ta2 735 $SUBU $ta2,$t2,$ta2 736 $SUBU $t2,$ta2,$v0 737 sgtu $v0,$t2,$ta2 738 $ST $t2,-2*$BNSZ($a0) 739 $ADDU $v0,$t8 740 741 sltu $t9,$t3,$ta3 742 $SUBU $ta3,$t3,$ta3 743 $SUBU $t3,$ta3,$v0 744 sgtu $v0,$t3,$ta3 745 $ST $t3,-$BNSZ($a0) 746 747 .set noreorder 748 bgtz $at,.L_bn_sub_words_loop 749 $ADDU $v0,$t9 750 751 beqz $a3,.L_bn_sub_words_return 752 nop 753 754.L_bn_sub_words_tail: 755 .set reorder 756 $LD $t0,0($a1) 757 $LD $ta0,0($a2) 758 subu 
$a3,1 759 sltu $t8,$t0,$ta0 760 $SUBU $ta0,$t0,$ta0 761 $SUBU $t0,$ta0,$v0 762 sgtu $v0,$t0,$ta0 763 $ST $t0,0($a0) 764 $ADDU $v0,$t8 765 beqz $a3,.L_bn_sub_words_return 766 767 $LD $t1,$BNSZ($a1) 768 subu $a3,1 769 $LD $ta1,$BNSZ($a2) 770 sltu $t9,$t1,$ta1 771 $SUBU $ta1,$t1,$ta1 772 $SUBU $t1,$ta1,$v0 773 sgtu $v0,$t1,$ta1 774 $ST $t1,$BNSZ($a0) 775 $ADDU $v0,$t9 776 beqz $a3,.L_bn_sub_words_return 777 778 $LD $t2,2*$BNSZ($a1) 779 $LD $ta2,2*$BNSZ($a2) 780 sltu $t8,$t2,$ta2 781 $SUBU $ta2,$t2,$ta2 782 $SUBU $t2,$ta2,$v0 783 sgtu $v0,$t2,$ta2 784 $ST $t2,2*$BNSZ($a0) 785 $ADDU $v0,$t8 786 787.L_bn_sub_words_return: 788 .set noreorder 789___ 790$code.=<<___ if ($flavour =~ /nubi/i); 791 $REG_L $t3,4*$SZREG($sp) 792 $REG_L $t2,3*$SZREG($sp) 793 $REG_L $t1,2*$SZREG($sp) 794 $REG_L $t0,1*$SZREG($sp) 795 $REG_L $gp,0*$SZREG($sp) 796 $PTR_ADD $sp,6*$SZREG 797___ 798$code.=<<___; 799 jr $ra 800 move $a0,$v0 801.end bn_sub_words_internal 802 803#if 0 804/* 805 * The bn_div_3_words entry point is reused for constant-time interface. 806 * Implementation is retained as historical reference. 
807 */ 808.align 5 809.globl bn_div_3_words 810.ent bn_div_3_words 811bn_div_3_words: 812 .set noreorder 813 move $a3,$a0 # we know that bn_div_words does not 814 # touch $a3, $ta2, $ta3 and preserves $a2 815 # so that we can save two arguments 816 # and return address in registers 817 # instead of stack:-) 818 819 $LD $a0,($a3) 820 move $ta2,$a1 821 bne $a0,$a2,bn_div_3_words_internal 822 $LD $a1,-$BNSZ($a3) 823 li $v0,-1 824 jr $ra 825 move $a0,$v0 826.end bn_div_3_words 827 828.align 5 829.ent bn_div_3_words_internal 830bn_div_3_words_internal: 831___ 832$code.=<<___ if ($flavour =~ /nubi/i); 833 .frame $sp,6*$SZREG,$ra 834 .mask 0x8000f008,-$SZREG 835 .set noreorder 836 $PTR_SUB $sp,6*$SZREG 837 $REG_S $ra,5*$SZREG($sp) 838 $REG_S $t3,4*$SZREG($sp) 839 $REG_S $t2,3*$SZREG($sp) 840 $REG_S $t1,2*$SZREG($sp) 841 $REG_S $t0,1*$SZREG($sp) 842 $REG_S $gp,0*$SZREG($sp) 843___ 844$code.=<<___; 845 .set reorder 846 move $ta3,$ra 847 bal bn_div_words_internal 848 move $ra,$ta3 849 $MULTU ($ta2,$v0) 850 $LD $t2,-2*$BNSZ($a3) 851 move $ta0,$zero 852 mfhi ($t1,$ta2,$v0) 853 mflo ($t0,$ta2,$v0) 854 sltu $t8,$t1,$a1 855.L_bn_div_3_words_inner_loop: 856 bnez $t8,.L_bn_div_3_words_inner_loop_done 857 sgeu $at,$t2,$t0 858 seq $t9,$t1,$a1 859 and $at,$t9 860 sltu $t3,$t0,$ta2 861 $ADDU $a1,$a2 862 $SUBU $t1,$t3 863 $SUBU $t0,$ta2 864 sltu $t8,$t1,$a1 865 sltu $ta0,$a1,$a2 866 or $t8,$ta0 867 .set noreorder 868 beqz $at,.L_bn_div_3_words_inner_loop 869 $SUBU $v0,1 870 $ADDU $v0,1 871 .set reorder 872.L_bn_div_3_words_inner_loop_done: 873 .set noreorder 874___ 875$code.=<<___ if ($flavour =~ /nubi/i); 876 $REG_L $t3,4*$SZREG($sp) 877 $REG_L $t2,3*$SZREG($sp) 878 $REG_L $t1,2*$SZREG($sp) 879 $REG_L $t0,1*$SZREG($sp) 880 $REG_L $gp,0*$SZREG($sp) 881 $PTR_ADD $sp,6*$SZREG 882___ 883$code.=<<___; 884 jr $ra 885 move $a0,$v0 886.end bn_div_3_words_internal 887#endif 888 889.align 5 890.globl bn_div_words 891.ent bn_div_words 892bn_div_words: 893 .set noreorder 894 bnez 
$a2,bn_div_words_internal 895 li $v0,-1 # I would rather signal div-by-zero 896 # which can be done with 'break 7' 897 jr $ra 898 move $a0,$v0 899.end bn_div_words 900 901.align 5 902.ent bn_div_words_internal 903bn_div_words_internal: 904___ 905$code.=<<___ if ($flavour =~ /nubi/i); 906 .frame $sp,6*$SZREG,$ra 907 .mask 0x8000f008,-$SZREG 908 .set noreorder 909 $PTR_SUB $sp,6*$SZREG 910 $REG_S $ra,5*$SZREG($sp) 911 $REG_S $t3,4*$SZREG($sp) 912 $REG_S $t2,3*$SZREG($sp) 913 $REG_S $t1,2*$SZREG($sp) 914 $REG_S $t0,1*$SZREG($sp) 915 $REG_S $gp,0*$SZREG($sp) 916___ 917$code.=<<___; 918 move $v1,$zero 919 bltz $a2,.L_bn_div_words_body 920 move $t9,$v1 921 $SLL $a2,1 922 bgtz $a2,.-4 923 addu $t9,1 924 925 .set reorder 926 negu $t1,$t9 927 li $t2,-1 928 $SLL $t2,$t1 929 and $t2,$a0 930 $SRL $at,$a1,$t1 931 .set noreorder 932 beqz $t2,.+12 933 nop 934 break 6 # signal overflow 935 .set reorder 936 $SLL $a0,$t9 937 $SLL $a1,$t9 938 or $a0,$at 939___ 940$QT=$ta0; 941$HH=$ta1; 942$DH=$v1; 943$code.=<<___; 944.L_bn_div_words_body: 945 $SRL $DH,$a2,4*$BNSZ # bits 946 sgeu $at,$a0,$a2 947 .set noreorder 948 beqz $at,.+12 949 nop 950 $SUBU $a0,$a2 951 .set reorder 952 953 li $QT,-1 954 $SRL $HH,$a0,4*$BNSZ # bits 955 $SRL $QT,4*$BNSZ # q=0xffffffff 956 beq $DH,$HH,.L_bn_div_words_skip_div1 957 $DIVU ($a0,$DH) 958 mfqt ($QT,$a0,$DH) 959.L_bn_div_words_skip_div1: 960 $MULTU ($a2,$QT) 961 $SLL $t3,$a0,4*$BNSZ # bits 962 $SRL $at,$a1,4*$BNSZ # bits 963 or $t3,$at 964 mflo ($t0,$a2,$QT) 965 mfhi ($t1,$a2,$QT) 966.L_bn_div_words_inner_loop1: 967 sltu $t2,$t3,$t0 968 seq $t8,$HH,$t1 969 sltu $at,$HH,$t1 970 and $t2,$t8 971 sltu $v0,$t0,$a2 972 or $at,$t2 973 .set noreorder 974 beqz $at,.L_bn_div_words_inner_loop1_done 975 $SUBU $t1,$v0 976 $SUBU $t0,$a2 977 b .L_bn_div_words_inner_loop1 978 $SUBU $QT,1 979 .set reorder 980.L_bn_div_words_inner_loop1_done: 981 982 $SLL $a1,4*$BNSZ # bits 983 $SUBU $a0,$t3,$t0 984 $SLL $v0,$QT,4*$BNSZ # bits 985 986 li $QT,-1 987 $SRL $HH,$a0,4*$BNSZ # 
bits 988 $SRL $QT,4*$BNSZ # q=0xffffffff 989 beq $DH,$HH,.L_bn_div_words_skip_div2 990 $DIVU ($a0,$DH) 991 mfqt ($QT,$a0,$DH) 992.L_bn_div_words_skip_div2: 993 $MULTU ($a2,$QT) 994 $SLL $t3,$a0,4*$BNSZ # bits 995 $SRL $at,$a1,4*$BNSZ # bits 996 or $t3,$at 997 mflo ($t0,$a2,$QT) 998 mfhi ($t1,$a2,$QT) 999.L_bn_div_words_inner_loop2: 1000 sltu $t2,$t3,$t0 1001 seq $t8,$HH,$t1 1002 sltu $at,$HH,$t1 1003 and $t2,$t8 1004 sltu $v1,$t0,$a2 1005 or $at,$t2 1006 .set noreorder 1007 beqz $at,.L_bn_div_words_inner_loop2_done 1008 $SUBU $t1,$v1 1009 $SUBU $t0,$a2 1010 b .L_bn_div_words_inner_loop2 1011 $SUBU $QT,1 1012 .set reorder 1013.L_bn_div_words_inner_loop2_done: 1014 1015 $SUBU $a0,$t3,$t0 1016 or $v0,$QT 1017 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it 1018 $SRL $a2,$t9 # restore $a2 1019 1020 .set noreorder 1021 move $a1,$v1 1022___ 1023$code.=<<___ if ($flavour =~ /nubi/i); 1024 $REG_L $t3,4*$SZREG($sp) 1025 $REG_L $t2,3*$SZREG($sp) 1026 $REG_L $t1,2*$SZREG($sp) 1027 $REG_L $t0,1*$SZREG($sp) 1028 $REG_L $gp,0*$SZREG($sp) 1029 $PTR_ADD $sp,6*$SZREG 1030___ 1031$code.=<<___; 1032 jr $ra 1033 move $a0,$v0 1034.end bn_div_words_internal 1035___ 1036undef $HH; undef $QT; undef $DH; 1037 1038($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); 1039($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); 1040 1041($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 1042($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 1043 1044($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); 1045 1046$code.=<<___; 1047 1048.align 5 1049.globl bn_mul_comba8 1050.ent bn_mul_comba8 1051bn_mul_comba8: 1052 .set noreorder 1053___ 1054$code.=<<___ if ($flavour =~ /nubi/i); 1055 .frame $sp,12*$SZREG,$ra 1056 .mask 0x803ff008,-$SZREG 1057 $PTR_SUB $sp,12*$SZREG 1058 $REG_S $ra,11*$SZREG($sp) 1059 $REG_S $s5,10*$SZREG($sp) 1060 $REG_S $s4,9*$SZREG($sp) 1061 $REG_S $s3,8*$SZREG($sp) 1062 $REG_S $s2,7*$SZREG($sp) 1063 $REG_S $s1,6*$SZREG($sp) 1064 $REG_S 
$s0,5*$SZREG($sp) 1065 $REG_S $t3,4*$SZREG($sp) 1066 $REG_S $t2,3*$SZREG($sp) 1067 $REG_S $t1,2*$SZREG($sp) 1068 $REG_S $t0,1*$SZREG($sp) 1069 $REG_S $gp,0*$SZREG($sp) 1070___ 1071$code.=<<___ if ($flavour !~ /nubi/i); 1072 .frame $sp,6*$SZREG,$ra 1073 .mask 0x003f0000,-$SZREG 1074 $PTR_SUB $sp,6*$SZREG 1075 $REG_S $s5,5*$SZREG($sp) 1076 $REG_S $s4,4*$SZREG($sp) 1077 $REG_S $s3,3*$SZREG($sp) 1078 $REG_S $s2,2*$SZREG($sp) 1079 $REG_S $s1,1*$SZREG($sp) 1080 $REG_S $s0,0*$SZREG($sp) 1081___ 1082$code.=<<___; 1083 1084 .set reorder 1085 $LD $a_0,0($a1) # If compiled with -mips3 option on 1086 # R5000 box assembler barks on this 1087 # 1ine with "should not have mult/div 1088 # as last instruction in bb (R10K 1089 # bug)" warning. If anybody out there 1090 # has a clue about how to circumvent 1091 # this do send me a note. 1092 # <appro\@fy.chalmers.se> 1093 1094 $LD $b_0,0($a2) 1095 $LD $a_1,$BNSZ($a1) 1096 $LD $a_2,2*$BNSZ($a1) 1097 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); 1098 $LD $a_3,3*$BNSZ($a1) 1099 $LD $b_1,$BNSZ($a2) 1100 $LD $b_2,2*$BNSZ($a2) 1101 $LD $b_3,3*$BNSZ($a2) 1102 mflo ($c_1,$a_0,$b_0) 1103 mfhi ($c_2,$a_0,$b_0) 1104 1105 $LD $a_4,4*$BNSZ($a1) 1106 $LD $a_5,5*$BNSZ($a1) 1107 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); 1108 $LD $a_6,6*$BNSZ($a1) 1109 $LD $a_7,7*$BNSZ($a1) 1110 $LD $b_4,4*$BNSZ($a2) 1111 $LD $b_5,5*$BNSZ($a2) 1112 mflo ($t_1,$a_0,$b_1) 1113 mfhi ($t_2,$a_0,$b_1) 1114 $ADDU $c_2,$t_1 1115 sltu $at,$c_2,$t_1 1116 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); 1117 $ADDU $c_3,$t_2,$at 1118 $LD $b_6,6*$BNSZ($a2) 1119 $LD $b_7,7*$BNSZ($a2) 1120 $ST $c_1,0($a0) # r[0]=c1; 1121 mflo ($t_1,$a_1,$b_0) 1122 mfhi ($t_2,$a_1,$b_0) 1123 $ADDU $c_2,$t_1 1124 sltu $at,$c_2,$t_1 1125 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); 1126 $ADDU $t_2,$at 1127 $ADDU $c_3,$t_2 1128 sltu $c_1,$c_3,$t_2 1129 $ST $c_2,$BNSZ($a0) # r[1]=c2; 1130 1131 mflo ($t_1,$a_2,$b_0) 1132 mfhi ($t_2,$a_2,$b_0) 1133 $ADDU $c_3,$t_1 1134 sltu 
$at,$c_3,$t_1 1135 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); 1136 $ADDU $t_2,$at 1137 $ADDU $c_1,$t_2 1138 mflo ($t_1,$a_1,$b_1) 1139 mfhi ($t_2,$a_1,$b_1) 1140 $ADDU $c_3,$t_1 1141 sltu $at,$c_3,$t_1 1142 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); 1143 $ADDU $t_2,$at 1144 $ADDU $c_1,$t_2 1145 sltu $c_2,$c_1,$t_2 1146 mflo ($t_1,$a_0,$b_2) 1147 mfhi ($t_2,$a_0,$b_2) 1148 $ADDU $c_3,$t_1 1149 sltu $at,$c_3,$t_1 1150 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); 1151 $ADDU $t_2,$at 1152 $ADDU $c_1,$t_2 1153 sltu $at,$c_1,$t_2 1154 $ADDU $c_2,$at 1155 $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1156 1157 mflo ($t_1,$a_0,$b_3) 1158 mfhi ($t_2,$a_0,$b_3) 1159 $ADDU $c_1,$t_1 1160 sltu $at,$c_1,$t_1 1161 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); 1162 $ADDU $t_2,$at 1163 $ADDU $c_2,$t_2 1164 sltu $c_3,$c_2,$t_2 1165 mflo ($t_1,$a_1,$b_2) 1166 mfhi ($t_2,$a_1,$b_2) 1167 $ADDU $c_1,$t_1 1168 sltu $at,$c_1,$t_1 1169 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); 1170 $ADDU $t_2,$at 1171 $ADDU $c_2,$t_2 1172 sltu $at,$c_2,$t_2 1173 $ADDU $c_3,$at 1174 mflo ($t_1,$a_2,$b_1) 1175 mfhi ($t_2,$a_2,$b_1) 1176 $ADDU $c_1,$t_1 1177 sltu $at,$c_1,$t_1 1178 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); 1179 $ADDU $t_2,$at 1180 $ADDU $c_2,$t_2 1181 sltu $at,$c_2,$t_2 1182 $ADDU $c_3,$at 1183 mflo ($t_1,$a_3,$b_0) 1184 mfhi ($t_2,$a_3,$b_0) 1185 $ADDU $c_1,$t_1 1186 sltu $at,$c_1,$t_1 1187 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); 1188 $ADDU $t_2,$at 1189 $ADDU $c_2,$t_2 1190 sltu $at,$c_2,$t_2 1191 $ADDU $c_3,$at 1192 $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1193 1194 mflo ($t_1,$a_4,$b_0) 1195 mfhi ($t_2,$a_4,$b_0) 1196 $ADDU $c_2,$t_1 1197 sltu $at,$c_2,$t_1 1198 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); 1199 $ADDU $t_2,$at 1200 $ADDU $c_3,$t_2 1201 sltu $c_1,$c_3,$t_2 1202 mflo ($t_1,$a_3,$b_1) 1203 mfhi ($t_2,$a_3,$b_1) 1204 $ADDU $c_2,$t_1 1205 sltu $at,$c_2,$t_1 1206 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); 1207 $ADDU 
$t_2,$at 1208 $ADDU $c_3,$t_2 1209 sltu $at,$c_3,$t_2 1210 $ADDU $c_1,$at 1211 mflo ($t_1,$a_2,$b_2) 1212 mfhi ($t_2,$a_2,$b_2) 1213 $ADDU $c_2,$t_1 1214 sltu $at,$c_2,$t_1 1215 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); 1216 $ADDU $t_2,$at 1217 $ADDU $c_3,$t_2 1218 sltu $at,$c_3,$t_2 1219 $ADDU $c_1,$at 1220 mflo ($t_1,$a_1,$b_3) 1221 mfhi ($t_2,$a_1,$b_3) 1222 $ADDU $c_2,$t_1 1223 sltu $at,$c_2,$t_1 1224 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); 1225 $ADDU $t_2,$at 1226 $ADDU $c_3,$t_2 1227 sltu $at,$c_3,$t_2 1228 $ADDU $c_1,$at 1229 mflo ($t_1,$a_0,$b_4) 1230 mfhi ($t_2,$a_0,$b_4) 1231 $ADDU $c_2,$t_1 1232 sltu $at,$c_2,$t_1 1233 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); 1234 $ADDU $t_2,$at 1235 $ADDU $c_3,$t_2 1236 sltu $at,$c_3,$t_2 1237 $ADDU $c_1,$at 1238 $ST $c_2,4*$BNSZ($a0) # r[4]=c2; 1239 1240 mflo ($t_1,$a_0,$b_5) 1241 mfhi ($t_2,$a_0,$b_5) 1242 $ADDU $c_3,$t_1 1243 sltu $at,$c_3,$t_1 1244 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); 1245 $ADDU $t_2,$at 1246 $ADDU $c_1,$t_2 1247 sltu $c_2,$c_1,$t_2 1248 mflo ($t_1,$a_1,$b_4) 1249 mfhi ($t_2,$a_1,$b_4) 1250 $ADDU $c_3,$t_1 1251 sltu $at,$c_3,$t_1 1252 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); 1253 $ADDU $t_2,$at 1254 $ADDU $c_1,$t_2 1255 sltu $at,$c_1,$t_2 1256 $ADDU $c_2,$at 1257 mflo ($t_1,$a_2,$b_3) 1258 mfhi ($t_2,$a_2,$b_3) 1259 $ADDU $c_3,$t_1 1260 sltu $at,$c_3,$t_1 1261 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); 1262 $ADDU $t_2,$at 1263 $ADDU $c_1,$t_2 1264 sltu $at,$c_1,$t_2 1265 $ADDU $c_2,$at 1266 mflo ($t_1,$a_3,$b_2) 1267 mfhi ($t_2,$a_3,$b_2) 1268 $ADDU $c_3,$t_1 1269 sltu $at,$c_3,$t_1 1270 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); 1271 $ADDU $t_2,$at 1272 $ADDU $c_1,$t_2 1273 sltu $at,$c_1,$t_2 1274 $ADDU $c_2,$at 1275 mflo ($t_1,$a_4,$b_1) 1276 mfhi ($t_2,$a_4,$b_1) 1277 $ADDU $c_3,$t_1 1278 sltu $at,$c_3,$t_1 1279 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); 1280 $ADDU $t_2,$at 1281 $ADDU $c_1,$t_2 1282 sltu 
$at,$c_1,$t_2 1283 $ADDU $c_2,$at 1284 mflo ($t_1,$a_5,$b_0) 1285 mfhi ($t_2,$a_5,$b_0) 1286 $ADDU $c_3,$t_1 1287 sltu $at,$c_3,$t_1 1288 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); 1289 $ADDU $t_2,$at 1290 $ADDU $c_1,$t_2 1291 sltu $at,$c_1,$t_2 1292 $ADDU $c_2,$at 1293 $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1294 1295 mflo ($t_1,$a_6,$b_0) 1296 mfhi ($t_2,$a_6,$b_0) 1297 $ADDU $c_1,$t_1 1298 sltu $at,$c_1,$t_1 1299 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); 1300 $ADDU $t_2,$at 1301 $ADDU $c_2,$t_2 1302 sltu $c_3,$c_2,$t_2 1303 mflo ($t_1,$a_5,$b_1) 1304 mfhi ($t_2,$a_5,$b_1) 1305 $ADDU $c_1,$t_1 1306 sltu $at,$c_1,$t_1 1307 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); 1308 $ADDU $t_2,$at 1309 $ADDU $c_2,$t_2 1310 sltu $at,$c_2,$t_2 1311 $ADDU $c_3,$at 1312 mflo ($t_1,$a_4,$b_2) 1313 mfhi ($t_2,$a_4,$b_2) 1314 $ADDU $c_1,$t_1 1315 sltu $at,$c_1,$t_1 1316 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); 1317 $ADDU $t_2,$at 1318 $ADDU $c_2,$t_2 1319 sltu $at,$c_2,$t_2 1320 $ADDU $c_3,$at 1321 mflo ($t_1,$a_3,$b_3) 1322 mfhi ($t_2,$a_3,$b_3) 1323 $ADDU $c_1,$t_1 1324 sltu $at,$c_1,$t_1 1325 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); 1326 $ADDU $t_2,$at 1327 $ADDU $c_2,$t_2 1328 sltu $at,$c_2,$t_2 1329 $ADDU $c_3,$at 1330 mflo ($t_1,$a_2,$b_4) 1331 mfhi ($t_2,$a_2,$b_4) 1332 $ADDU $c_1,$t_1 1333 sltu $at,$c_1,$t_1 1334 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); 1335 $ADDU $t_2,$at 1336 $ADDU $c_2,$t_2 1337 sltu $at,$c_2,$t_2 1338 $ADDU $c_3,$at 1339 mflo ($t_1,$a_1,$b_5) 1340 mfhi ($t_2,$a_1,$b_5) 1341 $ADDU $c_1,$t_1 1342 sltu $at,$c_1,$t_1 1343 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); 1344 $ADDU $t_2,$at 1345 $ADDU $c_2,$t_2 1346 sltu $at,$c_2,$t_2 1347 $ADDU $c_3,$at 1348 mflo ($t_1,$a_0,$b_6) 1349 mfhi ($t_2,$a_0,$b_6) 1350 $ADDU $c_1,$t_1 1351 sltu $at,$c_1,$t_1 1352 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); 1353 $ADDU $t_2,$at 1354 $ADDU $c_2,$t_2 1355 sltu $at,$c_2,$t_2 1356 $ADDU $c_3,$at 1357 $ST 
$c_1,6*$BNSZ($a0) # r[6]=c1; 1358 1359 mflo ($t_1,$a_0,$b_7) 1360 mfhi ($t_2,$a_0,$b_7) 1361 $ADDU $c_2,$t_1 1362 sltu $at,$c_2,$t_1 1363 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); 1364 $ADDU $t_2,$at 1365 $ADDU $c_3,$t_2 1366 sltu $c_1,$c_3,$t_2 1367 mflo ($t_1,$a_1,$b_6) 1368 mfhi ($t_2,$a_1,$b_6) 1369 $ADDU $c_2,$t_1 1370 sltu $at,$c_2,$t_1 1371 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); 1372 $ADDU $t_2,$at 1373 $ADDU $c_3,$t_2 1374 sltu $at,$c_3,$t_2 1375 $ADDU $c_1,$at 1376 mflo ($t_1,$a_2,$b_5) 1377 mfhi ($t_2,$a_2,$b_5) 1378 $ADDU $c_2,$t_1 1379 sltu $at,$c_2,$t_1 1380 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); 1381 $ADDU $t_2,$at 1382 $ADDU $c_3,$t_2 1383 sltu $at,$c_3,$t_2 1384 $ADDU $c_1,$at 1385 mflo ($t_1,$a_3,$b_4) 1386 mfhi ($t_2,$a_3,$b_4) 1387 $ADDU $c_2,$t_1 1388 sltu $at,$c_2,$t_1 1389 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); 1390 $ADDU $t_2,$at 1391 $ADDU $c_3,$t_2 1392 sltu $at,$c_3,$t_2 1393 $ADDU $c_1,$at 1394 mflo ($t_1,$a_4,$b_3) 1395 mfhi ($t_2,$a_4,$b_3) 1396 $ADDU $c_2,$t_1 1397 sltu $at,$c_2,$t_1 1398 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); 1399 $ADDU $t_2,$at 1400 $ADDU $c_3,$t_2 1401 sltu $at,$c_3,$t_2 1402 $ADDU $c_1,$at 1403 mflo ($t_1,$a_5,$b_2) 1404 mfhi ($t_2,$a_5,$b_2) 1405 $ADDU $c_2,$t_1 1406 sltu $at,$c_2,$t_1 1407 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); 1408 $ADDU $t_2,$at 1409 $ADDU $c_3,$t_2 1410 sltu $at,$c_3,$t_2 1411 $ADDU $c_1,$at 1412 mflo ($t_1,$a_6,$b_1) 1413 mfhi ($t_2,$a_6,$b_1) 1414 $ADDU $c_2,$t_1 1415 sltu $at,$c_2,$t_1 1416 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); 1417 $ADDU $t_2,$at 1418 $ADDU $c_3,$t_2 1419 sltu $at,$c_3,$t_2 1420 $ADDU $c_1,$at 1421 mflo ($t_1,$a_7,$b_0) 1422 mfhi ($t_2,$a_7,$b_0) 1423 $ADDU $c_2,$t_1 1424 sltu $at,$c_2,$t_1 1425 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); 1426 $ADDU $t_2,$at 1427 $ADDU $c_3,$t_2 1428 sltu $at,$c_3,$t_2 1429 $ADDU $c_1,$at 1430 $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1431 1432 mflo 
($t_1,$a_7,$b_1) 1433 mfhi ($t_2,$a_7,$b_1) 1434 $ADDU $c_3,$t_1 1435 sltu $at,$c_3,$t_1 1436 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); 1437 $ADDU $t_2,$at 1438 $ADDU $c_1,$t_2 1439 sltu $c_2,$c_1,$t_2 1440 mflo ($t_1,$a_6,$b_2) 1441 mfhi ($t_2,$a_6,$b_2) 1442 $ADDU $c_3,$t_1 1443 sltu $at,$c_3,$t_1 1444 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); 1445 $ADDU $t_2,$at 1446 $ADDU $c_1,$t_2 1447 sltu $at,$c_1,$t_2 1448 $ADDU $c_2,$at 1449 mflo ($t_1,$a_5,$b_3) 1450 mfhi ($t_2,$a_5,$b_3) 1451 $ADDU $c_3,$t_1 1452 sltu $at,$c_3,$t_1 1453 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2); 1454 $ADDU $t_2,$at 1455 $ADDU $c_1,$t_2 1456 sltu $at,$c_1,$t_2 1457 $ADDU $c_2,$at 1458 mflo ($t_1,$a_4,$b_4) 1459 mfhi ($t_2,$a_4,$b_4) 1460 $ADDU $c_3,$t_1 1461 sltu $at,$c_3,$t_1 1462 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2); 1463 $ADDU $t_2,$at 1464 $ADDU $c_1,$t_2 1465 sltu $at,$c_1,$t_2 1466 $ADDU $c_2,$at 1467 mflo ($t_1,$a_3,$b_5) 1468 mfhi ($t_2,$a_3,$b_5) 1469 $ADDU $c_3,$t_1 1470 sltu $at,$c_3,$t_1 1471 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); 1472 $ADDU $t_2,$at 1473 $ADDU $c_1,$t_2 1474 sltu $at,$c_1,$t_2 1475 $ADDU $c_2,$at 1476 mflo ($t_1,$a_2,$b_6) 1477 mfhi ($t_2,$a_2,$b_6) 1478 $ADDU $c_3,$t_1 1479 sltu $at,$c_3,$t_1 1480 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); 1481 $ADDU $t_2,$at 1482 $ADDU $c_1,$t_2 1483 sltu $at,$c_1,$t_2 1484 $ADDU $c_2,$at 1485 mflo ($t_1,$a_1,$b_7) 1486 mfhi ($t_2,$a_1,$b_7) 1487 $ADDU $c_3,$t_1 1488 sltu $at,$c_3,$t_1 1489 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); 1490 $ADDU $t_2,$at 1491 $ADDU $c_1,$t_2 1492 sltu $at,$c_1,$t_2 1493 $ADDU $c_2,$at 1494 $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1495 1496 mflo ($t_1,$a_2,$b_7) 1497 mfhi ($t_2,$a_2,$b_7) 1498 $ADDU $c_1,$t_1 1499 sltu $at,$c_1,$t_1 1500 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); 1501 $ADDU $t_2,$at 1502 $ADDU $c_2,$t_2 1503 sltu $c_3,$c_2,$t_2 1504 mflo ($t_1,$a_3,$b_6) 1505 mfhi ($t_2,$a_3,$b_6) 1506 $ADDU $c_1,$t_1 
	sltu	$at,$c_1,$t_1
	$MULTU	($a_4,$b_5)		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_4,$b_5)
	mfhi	($t_2,$a_4,$b_5)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_5,$b_4)		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_5,$b_4)
	mfhi	($t_2,$a_5,$b_4)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$b_3)		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_6,$b_3)
	mfhi	($t_2,$a_6,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_7,$b_2)		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_7,$b_2)
	mfhi	($t_2,$a_7,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_7,$b_3)		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	($t_1,$a_7,$b_3)
	mfhi	($t_2,$a_7,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_6,$b_4)		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_6,$b_4)
	mfhi	($t_2,$a_6,$b_4)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_5,$b_5)		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_5,$b_5)
	mfhi	($t_2,$a_5,$b_5)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$b_6)		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_4,$b_6)
	mfhi	($t_2,$a_4,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_3,$b_7)		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_3,$b_7)
	mfhi	($t_2,$a_3,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$b_7)		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	($t_1,$a_4,$b_7)
	mfhi	($t_2,$a_4,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_5,$b_6)		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_5,$b_6)
	mfhi	($t_2,$a_5,$b_6)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_6,$b_5)		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_6,$b_5)
	mfhi	($t_2,$a_6,$b_5)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_7,$b_4)		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_7,$b_4)
	mfhi	($t_2,$a_7,$b_4)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_7,$b_5)		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	($t_1,$a_7,$b_5)
	mfhi	($t_2,$a_7,$b_5)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$b_6)		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_6,$b_6)
	mfhi	($t_2,$a_6,$b_6)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_5,$b_7)		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_5,$b_7)
	mfhi	($t_2,$a_5,$b_7)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$b_7)		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	($t_1,$a_6,$b_7)
	mfhi	($t_2,$a_6,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_6)		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_7,$b_6)
	mfhi	($t_2,$a_7,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_7)		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	($t_1,$a_7,$b_7)
	mfhi	($t_2,$a_7,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogue of bn_mul_comba8 for the NUBI ABI flavour: restore the extra
# callee-saved registers ($t0-$t3, $gp) that the NUBI prologue spilled,
# then pop the 12-slot frame in the branch delay slot of jr.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Epilogue of bn_mul_comba8 for standard (non-NUBI) ABIs: only $s0-$s5
# were saved, in a 6-slot frame.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# bn_mul_comba4(r,a,b): fully unrolled 4x4-word Comba multiplication.
# Same column-by-column scheme as bn_mul_comba8 above, but small enough
# to need no callee-saved registers on non-NUBI ABIs (no frame is set up).
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# The squaring routines reuse the b[] register slots for the upper half
# of a[]; aliasing them here keeps the emitted code below readable.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# add_c2() emits the doubled-product accumulation step used by the
# squaring code (mul_add_c2): the low word $lo is added into $c0 twice
# (cross products a[i]*a[j], i!=j, occur twice in a square), carries are
# propagated through $c1 into $c2, and a "forward" $MULTU for the *next*
# step is interleaved to hide multiplier latency.  The emitted code ends
# with mflo/mfhi loading that next product into $lo/$hi.
# NOTE(review): the empty () prototype is vestigial — calls use &add_c2(...)
# which bypasses prototypes anyway; left untouched.
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,	# !$warm denotes first call with specific sequence of
		# $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn	# these two are arguments for multiplication which
		# result is used in *next* step [which is why it's
		# commented as "forward multiplication" below];
   )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
# First use of $c2 in this column: set it directly from the carry-out
# instead of accumulating into a stale value.
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}

# bn_sqr_comba8(r,a): fully unrolled 8-word Comba squaring.
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# Epilogue of bn_sqr_comba8 (NUBI only; no registers to restore otherwise).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_sqr_comba4(r,a): fully unrolled 4-word Comba squaring.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# Epilogue of bn_sqr_comba4 (NUBI only; no registers to restore otherwise).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly to STDOUT (redirected to $output above).
print $code;
close STDOUT or die "error closing STDOUT: $!";