#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

use strict;
use warnings;

# The last command-line argument, if any, names the output file; without
# one the generated assembly goes to the inherited STDOUT.  The open is
# three-argument (so the name is never parsed for mode characters) and is
# checked immediately, so a bad path is reported here rather than as a
# confusing failure when STDOUT is closed at the end of the script.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Accumulates the generated assembly text; printed once at the very end.
my $code = '';

# Registers holding the arguments of bn_GF2m_mul_2x2.
my ($rp, $a1, $a0, $b1, $b0) = ("A4", "B4", "A6", "B6", "A8");	# argument vector

# Working registers shared by all three mul_1x1_* emitters below:
# the two halfwords of the A operand and their partial products ...
my ($Alo, $Alox0, $Alox1, $Alox2, $Alox3) = map { "A$_" } 16 .. 20;
my ($Ahi, $Ahix0, $Ahix1, $Ahix2, $Ahix3) = map { "B$_" } 16 .. 20;
# ... and the four bytes of the B operand.
my ($B_0, $B_1, $B_2, $B_3) = ("B5", "A5", "A7", "B7");
# Scratch operand registers used by the main sequence.  They deliberately
# alias $Alo and $B_1, which the emitted code overwrites only after the
# operand itself has been consumed.
my ($A, $B) = ($Alo, $B_1);
# Register loaded with the 0xFF byte mask by the function prologue.
my $xFF = "B1";

# mul_1x1_upper(A, B)
#
# Appends to $code the first half of a 32x32-bit polynomial multiplication
# of registers A and B: B is split into four bytes, A into two halfwords,
# and the eight 16x8-bit XORMPY partial products are issued.  The partial
# products are left in $Alox0..3/$Ahix0..3 for mul_1x1_merged or
# mul_1x1_lower to combine.  Nothing is returned.
sub mul_1x1_upper {
    my ($A, $B) = @_;
    $code .= <<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
    return;
}

# mul_1x1_merged(OUTlo, OUThi, A, B)
#
# Appends to $code an interleaving of two steps: the combining half of the
# multiplication already in flight (same XOR/shift sequence as
# mul_1x1_lower, result written to OUTlo/OUThi) and the first half of the
# next multiplication of registers A and B (same sequence as
# mul_1x1_upper).  Because both halves share the partial-product
# registers, the new $Alox0 product is parked in A1 until the old value
# has been consumed, then moved into place by the final MV.  The
# instruction order is part of the schedule and must not be rearranged.
# Nothing is returned.
sub mul_1x1_merged {
    my ($OUTlo, $OUThi, $A, $B) = @_;
    $code .= <<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	XORMPY	$Alo,$B_2,$Alox2
	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
||	XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	XORMPY	$Ahi,$B_0,$Ahix0
||	XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
||	MV	A1,$Alox0
___
    return;
}

# mul_1x1_lower(OUTlo, OUThi)
#
# Appends to $code the second half of a 1x1 multiplication: the partial
# products left in $Alox0..3/$Ahix0..3 are shifted into position and XORed
# together into the register pair OUTlo/OUThi.  Nothing is returned.
sub mul_1x1_lower {
    my ($OUTlo, $OUThi) = @_;
    $code .= <<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
    return;
}

# Function prologue: section, symbol naming for both ABIs, entry label and
# the 0xFF byte mask used by every mul_1x1_* step.
$code .= <<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___

# Three 1x1 multiplications are chained: a0·b0, a1·b1 and
# (a0+a1)·(b0+b1).  Each fragment appended between the calls begins with
# a "||" line, i.e. it is issued in parallel with the last instruction
# emitted by the preceding call, so the order of these statements matters.
mul_1x1_upper($a0, $b0);			# a0·b0
$code .= <<___;
||	MV	$b1,$B
	MV	$a1,$A
___
mul_1x1_merged("A28", "B28", $A, $B);		# a0·b0/a1·b1
$code .= <<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
mul_1x1_merged("A31", "B31", $A, $B);		# a1·b1/(a0+a1)·(b0+b1)
$code .= <<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
mul_1x1_lower("A30", "B30");			# (a0+a1)·(b0+b1)

# Combine the three products, store the four result words through $rp and
# return via B3.
$code .= <<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

print $code;
# Buffered write errors only surface at close, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";