#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Reference to crypto/md5/asm/md5-x86_64.pl
# MD5 optimized for LoongArch.
#
# This script emits LoongArch64 assembly implementing
# ossl_md5_block_asm_data_order(MD5_CTX *ctx, const void *ptr, size_t nbr).

use strict;
use warnings;

my $code;

# LoongArch integer register aliases (ABI names -> raw $rN used in output).
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));

# $output is the last argument if it looks like a file (it has an extension)
my $output;
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
if (defined $output) {
	# Three-argument open with an explicit error check; the previous
	# two-argument, unchecked `open STDOUT,">$output"` silently ignored
	# failures (and warned on an undefined $output under `use warnings`).
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# round1_step() does:
#   dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# and leaves for the next step:
#   $t1 = y ^ z
#   $t2 = dst + X[k_next]
# $pos is -1 for the first step of the round, 1 for the last, 0 otherwise.
sub round1_step
{
	my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
	my $T_i_h = ($T_i & 0xfffff000) >> 12;
	my $T_i_l = $T_i & 0xfff;

	# In LoongArch we have to use two instructions of lu12i.w and ori to
	# load a 32-bit immediate into a general register. Meanwhile, the
	# instruction lu12i.w treats the 20-bit immediate as a signed number.
	# So if the T_i_h is greater than or equal to (1<<19), we need provide
	# lu12i.w a corresponding negative number whose complement equals to
	# the sign extension of T_i_h.

	# The details of the instruction lu12i.w can be found as following:
	# https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_lu12i_w_lu32i_d_lu52i_d

	$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));

	$code .= "	ld.w	$t0,$a1,0		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
	$code .= "	xor	$t1,$y,$z		/* y ^ z */\n" if ($pos == -1);
	$code .= "	add.w	$t2,$dst,$t0		/* dst + X[k] */\n" if ($pos == -1);
	$code .= <<EOF;
	lu12i.w	$t8,$T_i_h		/* load bits [31:12] of constant */
	and	$t1,$x,$t1		/* x & ... */
	ori	$t8,$t8,$T_i_l		/* load bits [11:0] of constant */
	xor	$t1,$z,$t1		/* z ^ ... */
	add.w	$t7,$t2,$t8		/* dst + X[k] + Const */
	ld.w	$t0,$a1,$k_next*4	/* (NEXT STEP) X[$k_next] */
	add.w	$dst,$t7,$t1		/* dst += ... */
	add.w	$t2,$z,$t0		/* (NEXT STEP) dst + X[$k_next] */
EOF

	$code .= "	rotri.w	$dst,$dst,32-$s	/* dst <<< s */\n";
	if ($pos != 1) {
		$code .= "	xor	$t1,$x,$y		/* (NEXT STEP) y ^ z */\n";
	} else {
		# Last step of round 1: prime $t0/$t1 for round2_step's G().
		$code .= "	move	$t0,$a7		/* (NEXT ROUND) $t0 = z' (copy of z) */\n";
		$code .= "	nor	$t1,$zero,$a7	/* (NEXT ROUND) $t1 = not z' (copy of not z) */\n";
	}
	$code .= "	add.w	$dst,$dst,$x		/* dst += x */\n";
}

# round2_step() does:
#   dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# and leaves for the next step:
#   $t0 = z' (copy of z for the next step)
#   $t1 = not z' (copy of not z for the next step)
#   $t2 = dst + X[k_next]
sub round2_step
{
	my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
	my $T_i_h = ($T_i & 0xfffff000) >> 12;
	my $T_i_l = $T_i & 0xfff;
	# See round1_step() for why a negative lu12i.w immediate may be needed.
	$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));

	$code .= <<EOF;
	lu12i.w	$t8,$T_i_h		/* load bits [31:12] of Constant */
	and	$t0,$x,$t0		/* x & z */
	ori	$t8,$t8,$T_i_l		/* load bits [11:0] of Constant */
	and	$t1,$y,$t1		/* y & (not z) */
	add.w	$t7,$t2,$t8		/* dst + X[k] + Const */
	or	$t1,$t0,$t1		/* (y & (not z)) | (x & z) */
	ld.w	$t0,$a1,$k_next*4	/* (NEXT STEP) X[$k_next] */
	add.w	$dst,$t7,$t1		/* dst += ... */
	add.w	$t2,$z,$t0		/* (NEXT STEP) dst + X[$k_next] */
EOF

	$code .= "	rotri.w	$dst,$dst,32-$s	/* dst <<< s */\n";
	if ($pos != 1) {
		$code .= "	move	$t0,$y		/* (NEXT STEP) z' = $y */\n";
		$code .= "	nor	$t1,$zero,$y	/* (NEXT STEP) not z' = not $y */\n";
	} else {
		# Last step of round 2: prime $t1 for round3_step's H().
		$code .= "	xor	$t1,$a6,$a7	/* (NEXT ROUND) $t1 = y ^ z */\n";
	}
	$code .= "	add.w	$dst,$dst,$x		/* dst += x */\n";
}

# round3_step() does:
#   dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# and leaves for the next step:
#   $t1 = y ^ z
#   $t2 = dst + X[k_next]
sub round3_step
{
	my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
	my $T_i_h = ($T_i & 0xfffff000) >> 12;
	my $T_i_l = $T_i & 0xfff;
	# See round1_step() for why a negative lu12i.w immediate may be needed.
	$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));

	$code .= <<EOF;
	lu12i.w	$t8,$T_i_h		/* load bits [31:12] of Constant */
	xor	$t1,$x,$t1		/* x ^ ... */
	ori	$t8,$t8,$T_i_l		/* load bits [11:0] of Constant */
	add.w	$t7,$t2,$t8		/* dst + X[k] + Const */
	ld.w	$t0,$a1,$k_next*4	/* (NEXT STEP) X[$k_next] */
	add.w	$dst,$t7,$t1		/* dst += ... */
	add.w	$t2,$z,$t0		/* (NEXT STEP) dst + X[$k_next] */
EOF

	$code .= "	rotri.w	$dst,$dst,32-$s	/* dst <<< s */\n";
	if ($pos != 1) {
		$code .= "	xor	$t1,$x,$y		/* (NEXT STEP) y ^ z */\n";
	} else {
		# Last step of round 3: prime $t1 for round4_step's I().
		$code .= "	nor	$t1,$zero,$a7	/* (NEXT ROUND) $t1 = not z */\n";
	}
	$code .= "	add.w	$dst,$dst,$x		/* dst += x */\n";
}

# round4_step() does:
#   dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# and leaves for the next step:
#   $t1 = not z' (copy of not z for the next step)
#   $t2 = dst + X[k_next]
# On the final step ($pos == 1) it instead folds the saved A/D back in and
# advances the data pointer for the next 64-byte block.
sub round4_step
{
	my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
	my $T_i_h = ($T_i & 0xfffff000) >> 12;
	my $T_i_l = $T_i & 0xfff;
	# See round1_step() for why a negative lu12i.w immediate may be needed.
	$T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));

	$code .= <<EOF;
	lu12i.w	$t8,$T_i_h		/* load bits [31:12] of Constant */
	or	$t1,$x,$t1		/* x | ... */
	ori	$t8,$t8,$T_i_l		/* load bits [11:0] of Constant */
	xor	$t1,$y,$t1		/* y ^ ... */
	add.w	$t7,$t2,$t8		/* dst + X[k] + Const */
EOF

	if ($pos != 1) {
		$code .= "	ld.w	$t0,$a1,$k_next*4	/* (NEXT STEP) X[$k_next] */\n";
		$code .= "	add.w	$dst,$t7,$t1		/* dst += ... */\n";
		$code .= "	add.w	$t2,$z,$t0		/* (NEXT STEP) dst + X[$k_next] */\n";
		$code .= "	rotri.w	$dst,$dst,32-$s	/* dst <<< s */\n";
		$code .= "	nor	$t1,$zero,$y	/* (NEXT STEP) not z' = not $y */\n";
		$code .= "	add.w	$dst,$dst,$x		/* dst += x */\n";
	} else {
		$code .= "	add.w	$a4,$t3,$a4	/* (NEXT LOOP) add old value of A */\n";
		$code .= "	add.w	$dst,$t7,$t1		/* dst += ... */\n";
		$code .= "	add.w	$a7,$t6,$a7	/* (NEXT LOOP) add old value of D */\n";
		$code .= "	rotri.w	$dst,$dst,32-$s	/* dst <<< s */\n";
		$code .= "	addi.d	$a1,$a1,64	/* (NEXT LOOP) ptr += 64 */\n";
		$code .= "	add.w	$dst,$dst,$x		/* dst += x */\n";
	}
}

$code .= <<EOF;
.text

.globl ossl_md5_block_asm_data_order
.type ossl_md5_block_asm_data_order function
ossl_md5_block_asm_data_order:
	# $a0 = arg #1 (ctx, MD5_CTX pointer)
	# $a1 = arg #2 (ptr, data pointer)
	# $a2 = arg #3 (nbr, number of 16-word blocks to process)
	beqz	$a2,.Lend	# cmp nbr with 0, jmp if nbr == 0

	# ptr is '$a1'
	# end is '$a3'
	slli.d	$t0,$a2,6
	add.d	$a3,$a1,$t0

	# A is '$a4'
	# B is '$a5'
	# C is '$a6'
	# D is '$a7'
	ld.w	$a4,$a0,0	# a4 = ctx->A
	ld.w	$a5,$a0,4	# a5 = ctx->B
	ld.w	$a6,$a0,8	# a6 = ctx->C
	ld.w	$a7,$a0,12	# a7 = ctx->D

# BEGIN of loop over 16-word blocks
.align 6
.Lloop:
	# save old values of A, B, C, D
	move	$t3,$a4
	move	$t4,$a5
	move	$t5,$a6
	move	$t6,$a7

	preld	0,$a1,0
	preld	0,$a1,64
EOF

round1_step(-1, $a4, $a5, $a6, $a7, '1', 0xd76aa478, '7');
round1_step(0, $a7, $a4, $a5, $a6, '2', 0xe8c7b756, '12');
round1_step(0, $a6, $a7, $a4, $a5, '3', 0x242070db, '17');
round1_step(0, $a5, $a6, $a7, $a4, '4', 0xc1bdceee, '22');
round1_step(0, $a4, $a5, $a6, $a7, '5', 0xf57c0faf, '7');
round1_step(0, $a7, $a4, $a5, $a6, '6', 0x4787c62a, '12');
round1_step(0, $a6, $a7, $a4, $a5, '7', 0xa8304613, '17');
round1_step(0, $a5, $a6, $a7, $a4, '8', 0xfd469501, '22');
round1_step(0, $a4, $a5, $a6, $a7, '9', 0x698098d8, '7');
round1_step(0, $a7, $a4, $a5, $a6, '10', 0x8b44f7af, '12');
round1_step(0, $a6, $a7, $a4, $a5, '11', 0xffff5bb1, '17');
round1_step(0, $a5, $a6, $a7, $a4, '12', 0x895cd7be, '22');
round1_step(0, $a4, $a5, $a6, $a7, '13', 0x6b901122, '7');
round1_step(0, $a7, $a4, $a5, $a6, '14', 0xfd987193, '12');
round1_step(0, $a6, $a7, $a4, $a5, '15', 0xa679438e, '17');
round1_step(1, $a5, $a6, $a7, $a4, '1', 0x49b40821, '22');

round2_step(-1, $a4, $a5, $a6, $a7, '6', 0xf61e2562, '5');
round2_step(0, $a7, $a4, $a5, $a6, '11', 0xc040b340, '9');
round2_step(0, $a6, $a7, $a4, $a5, '0', 0x265e5a51, '14');
round2_step(0, $a5, $a6, $a7, $a4, '5', 0xe9b6c7aa, '20');
round2_step(0, $a4, $a5, $a6, $a7, '10', 0xd62f105d, '5');
round2_step(0, $a7, $a4, $a5, $a6, '15', 0x2441453, '9');
round2_step(0, $a6, $a7, $a4, $a5, '4', 0xd8a1e681, '14');
round2_step(0, $a5, $a6, $a7, $a4, '9', 0xe7d3fbc8, '20');
round2_step(0, $a4, $a5, $a6, $a7, '14', 0x21e1cde6, '5');
round2_step(0, $a7, $a4, $a5, $a6, '3', 0xc33707d6, '9');
round2_step(0, $a6, $a7, $a4, $a5, '8', 0xf4d50d87, '14');
round2_step(0, $a5, $a6, $a7, $a4, '13', 0x455a14ed, '20');
round2_step(0, $a4, $a5, $a6, $a7, '2', 0xa9e3e905, '5');
round2_step(0, $a7, $a4, $a5, $a6, '7', 0xfcefa3f8, '9');
round2_step(0, $a6, $a7, $a4, $a5, '12', 0x676f02d9, '14');
round2_step(1, $a5, $a6, $a7, $a4, '5', 0x8d2a4c8a, '20');

round3_step(-1, $a4, $a5, $a6, $a7, '8', 0xfffa3942, '4');
round3_step(0, $a7, $a4, $a5, $a6, '11', 0x8771f681, '11');
round3_step(0, $a6, $a7, $a4, $a5, '14', 0x6d9d6122, '16');
round3_step(0, $a5, $a6, $a7, $a4, '1', 0xfde5380c, '23');
round3_step(0, $a4, $a5, $a6, $a7, '4', 0xa4beea44, '4');
round3_step(0, $a7, $a4, $a5, $a6, '7', 0x4bdecfa9, '11');
round3_step(0, $a6, $a7, $a4, $a5, '10', 0xf6bb4b60, '16');
round3_step(0, $a5, $a6, $a7, $a4, '13', 0xbebfbc70, '23');
round3_step(0, $a4, $a5, $a6, $a7, '0', 0x289b7ec6, '4');
round3_step(0, $a7, $a4, $a5, $a6, '3', 0xeaa127fa, '11');
round3_step(0, $a6, $a7, $a4, $a5, '6', 0xd4ef3085, '16');
round3_step(0, $a5, $a6, $a7, $a4, '9', 0x4881d05, '23');
round3_step(0, $a4, $a5, $a6, $a7, '12', 0xd9d4d039, '4');
round3_step(0, $a7, $a4, $a5, $a6, '15', 0xe6db99e5, '11');
round3_step(0, $a6, $a7, $a4, $a5, '2', 0x1fa27cf8, '16');
round3_step(1, $a5, $a6, $a7, $a4, '0', 0xc4ac5665, '23');

round4_step(-1, $a4, $a5, $a6, $a7, '7', 0xf4292244, '6');
round4_step(0, $a7, $a4, $a5, $a6, '14', 0x432aff97, '10');
round4_step(0, $a6, $a7, $a4, $a5, '5', 0xab9423a7, '15');
round4_step(0, $a5, $a6, $a7, $a4, '12', 0xfc93a039, '21');
round4_step(0, $a4, $a5, $a6, $a7, '3', 0x655b59c3, '6');
round4_step(0, $a7, $a4, $a5, $a6, '10', 0x8f0ccc92, '10');
round4_step(0, $a6, $a7, $a4, $a5, '1', 0xffeff47d, '15');
round4_step(0, $a5, $a6, $a7, $a4, '8', 0x85845dd1, '21');
round4_step(0, $a4, $a5, $a6, $a7, '15', 0x6fa87e4f, '6');
round4_step(0, $a7, $a4, $a5, $a6, '6', 0xfe2ce6e0, '10');
round4_step(0, $a6, $a7, $a4, $a5, '13', 0xa3014314, '15');
round4_step(0, $a5, $a6, $a7, $a4, '4', 0x4e0811a1, '21');
round4_step(0, $a4, $a5, $a6, $a7, '11', 0xf7537e82, '6');
round4_step(0, $a7, $a4, $a5, $a6, '2', 0xbd3af235, '10');
round4_step(0, $a6, $a7, $a4, $a5, '9', 0x2ad7d2bb, '15');
round4_step(1, $a5, $a6, $a7, $a4, '0', 0xeb86d391, '21');

# A and D were already updated by the last round4_step; fold in B and C,
# then loop while the data pointer has not reached the end.
$code .= <<EOF;
	# add old values of B, C
	add.w	$a5,$t4,$a5
	add.w	$a6,$t5,$a6

	bltu	$a1,$a3,.Lloop	# jmp if ptr < end

	st.w	$a4,$a0,0	# ctx->A = A
	st.w	$a5,$a0,4	# ctx->B = B
	st.w	$a6,$a0,8	# ctx->C = C
	st.w	$a7,$a0,12	# ctx->D = D

.Lend:
	jr	$ra
.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
EOF

# Standard perlasm post-processing: evaluate any `...` expressions embedded
# in the generated text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Check close: buffered write errors to the output file surface here.
close STDOUT or die "can't close STDOUT: $!";