#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Poly1305 hash for MIPS64.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	5.64/+120%	(big-endian)
# Octeon II	3.80/+280%	(little-endian)
#
# This script does not compute anything itself: it accumulates assembly
# source text in $code and prints it to the output file (or to STDOUT
# when no output file is given on the command line).

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
# supported flavours are o32,n32,64,nubi32,nubi64, default is o32
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";

# The default flavour ("o32") is rejected here, so a 64-bit flavour has
# to be passed explicitly.
die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Register that carries the return value, and the .mask bitmap of the
# registers poly1305_blocks_internal saves: bits 12-17 for NUBI
# flavours, bits 16-17 otherwise.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Argument registers shared by the generated routines, plus scratch.
($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# poly1305_init($ctx,$inp):
#   - zeroes the three 64-bit hash words at 0/8/16($ctx);
#   - if $inp is non-NULL, loads 16 key bytes (unaligned-safe ldl/ldr
#     pair before MIPS64R6), byte-swaps them on big-endian targets,
#     masks them and stores r0 at 24($ctx), r1 at 32($ctx) and
#     s1 = r1 + (r1 >> 2) at 40($ctx);
#   - returns 0 in $v0.
$code.=<<___;
#include "mips_arch.h"

#ifdef MIPSEB
# define MSB	0
# define LSB	7
#else
# define MSB	7
# define LSB	0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32
	daddiu	$tmp0,-63
	dsll	$tmp0,28
	daddiu	$tmp0,-1		# 0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# poly1305_blocks($ctx,$inp,$len,$padbit):
#   The public entry point shifts $len right by 4 (number of complete
#   16-byte blocks) and returns immediately when that is zero;
#   otherwise it branches to poly1305_blocks_internal, which allocates
#   a 6*8-byte frame, saves $s4/$s5 (and $s0-$s3 for NUBI flavours),
#   and runs .Loop once per block on the hash words and key values
#   loaded from $ctx.
my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	.set	noreorder
	dsubu	$sp,6*8
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$s1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$len,-1
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	daddu	$h0,$in0		# accumulate input
	daddu	$h1,$in1
	sltu	$tmp0,$h0,$in0
	sltu	$tmp1,$h1,$in1
	daddu	$h1,$tmp0

	dmultu	($r0,$h0)		# h0*r0
	daddu	$h2,$padbit
	sltu	$tmp0,$h1,$tmp0
	mflo	($d0,$r0,$h0)
	mfhi	($d1,$r0,$h0)

	dmultu	($s1,$h1)		# h1*5*r1
	daddu	$tmp0,$tmp1
	daddu	$h2,$tmp0
	mflo	($tmp0,$s1,$h1)
	mfhi	($tmp1,$s1,$h1)

	dmultu	($r1,$h0)		# h0*r1
	daddu	$d0,$tmp0
	daddu	$d1,$tmp1
	mflo	($tmp2,$r1,$h0)
	mfhi	($d2,$r1,$h0)
	sltu	$tmp0,$d0,$tmp0
	daddu	$d1,$tmp0

	dmultu	($r0,$h1)		# h1*r0
	daddu	$d1,$tmp2
	sltu	$tmp2,$d1,$tmp2
	mflo	($tmp0,$r0,$h1)
	mfhi	($tmp1,$r0,$h1)
	daddu	$d2,$tmp2

	dmultu	($s1,$h2)		# h2*5*r1
	daddu	$d1,$tmp0
	daddu	$d2,$tmp1
	mflo	($tmp2,$s1,$h2)

	dmultu	($r0,$h2)		# h2*r0
	sltu	$tmp0,$d1,$tmp0
	daddu	$d2,$tmp0
	mflo	($tmp3,$r0,$h2)

	daddu	$d1,$tmp2
	daddu	$d2,$tmp3
	sltu	$tmp2,$d1,$tmp2
	daddu	$d2,$tmp2

	li	$tmp0,-4		# final reduction
	and	$tmp0,$d2
	dsrl	$tmp1,$d2,2
	andi	$h2,$d2,3
	daddu	$tmp0,$tmp1
	daddu	$h0,$d0,$tmp0
	sltu	$tmp0,$h0,$tmp0
	daddu	$h1,$d1,$tmp0
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$h2,$tmp0

	bnez	$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
	daddu	$sp,6*8
.end	poly1305_blocks_internal
___
}
{
# poly1305_emit($ctx,$mac,$nonce):
#   Loads the three hash words from $ctx, adds 5 and uses the carry out
#   of bit 2 of the top word to select between the original and the
#   incremented low 128 bits, adds the nonce (read as four 32-bit
#   words with lwu) and stores the 16 result bytes to $mac one byte at
#   a time, least significant byte first.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)
	ld	$tmp2,16($ctx)

	daddiu	$in0,$tmp0,5		# compare to modulus
	sltiu	$tmp3,$in0,5
	daddu	$in1,$tmp1,$tmp3
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2
	nor	$tmp3,$zero,$tmp2

	and	$in0,$tmp2
	and	$tmp0,$tmp3
	and	$in1,$tmp2
	and	$tmp1,$tmp3
	or	$in0,$tmp0
	or	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
}

# Redirect STDOUT to the requested output file, if any.  The
# three-argument form keeps characters in the file name from being
# interpreted as an open mode, and a failed open is reported at once
# with its real cause rather than surfacing later at close.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";