#! /usr/bin/env perl
# Copyright 2015-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################

# Loongarch64 LSX adaptation by <zhuchen@loongson.cn>,
# <lujingfeng@loongson.cn> and <shichenlong@loongson.cn>
#

($zero,$ra,$tp,$sp)=map("\$r$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9)=map("\$r$_",(12..21));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$r$_",(23..30));
($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,$vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19)=map("\$vr$_",(0..19));
($fp)=map("\$r$_",(22));
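# Note: $vr18 is zeroed in _vpaes_preheat and stays zero for the whole file;
# it fills the extra source operand of LSX's four-operand vshuf.b, standing
# in for the single-table pshufb of the x86 original.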

for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output" or die "can't open $output: $!";

$PREFIX="vpaes";

$code.=<<___;

##
##  _aes_encrypt_core
##
##  AES-encrypt %vr0.
##
##  Inputs:
##     %vr0 = input
##     %vr9-%vr15 as in _vpaes_preheat
##    (%a2) = scheduled keys
##
##  Output in %vr0
##  Clobbers %vr1-%vr5; $a5, $a6, $a7, $t0, $t5
##  Preserves %vr6-%vr8 so you get some local vectors
##
##
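##  Every S-box style lookup below is nibble-wise: a byte is split into
##  its low and high 4-bit halves and vshuf.b is used as a 16-entry
##  byte table, roughly
##      lo = tbl_lo[ x       & 0x0F ]
##      hi = tbl_hi[(x >> 4) & 0x0F ]
##      y  = lo ^ hi
##  which is why the 0x0F mask in %vr9 and a 4-bit shift open each round.
##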
##.type	_vpaes_encrypt_core
.align 4
_vpaes_encrypt_core:
.cfi_startproc
    move      $a5,$a2
    li.d      $a7,0x10
    ld.w      $t5,$a2,240
    vori.b    $vr1,$vr9,0
    la.local  $t0,Lk_ipt
    vld       $vr2,$t0,0    # iptlo
    vandn.v   $vr1,$vr1,$vr0
    vld       $vr5,$a5,0    # round0 key
    vsrli.w   $vr1,$vr1,4
    vand.v    $vr0,$vr0,$vr9
    vshuf.b   $vr2,$vr18,$vr2,$vr0
    vld       $vr0,$t0,16   # ipthi
    vshuf.b   $vr0,$vr18,$vr0,$vr1
    vxor.v    $vr2,$vr2,$vr5
    addi.d    $a5,$a5,16
    vxor.v    $vr0,$vr0,$vr2
    la.local  $a6,Lk_mc_backward
    b         .Lenc_entry

.align 4
.Lenc_loop:
    # middle of middle round
    vori.b    $vr4,$vr13,0           # 4 : sb1u
    vori.b    $vr0,$vr12,0           # 0 : sb1t
    vshuf.b   $vr4,$vr18,$vr4,$vr2    # 4 = sb1u
    vshuf.b   $vr0,$vr18,$vr0,$vr3    # 0 = sb1t
    vxor.v    $vr4,$vr4,$vr5         # 4 = sb1u + k
    vori.b    $vr5,$vr15,0           # 4 : sb2u
    vxor.v    $vr0,$vr0,$vr4         # 0 = A
    add.d     $t0,$a7,$a6            # Lk_mc_forward[]
    vld       $vr1,$t0,-0x40
    vshuf.b   $vr5,$vr18,$vr5,$vr2    # 4 = sb2u
    vld       $vr4,$t0,0             # Lk_mc_backward[]
    vori.b    $vr2,$vr14,0           # 2 : sb2t
    vshuf.b   $vr2,$vr18,$vr2,$vr3    # 2 = sb2t
    vori.b    $vr3,$vr0,0            # 3 = A
    vxor.v    $vr2,$vr5,$vr2         # 2 = 2A
    vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = B
    addi.d    $a5,$a5,16             # next key
    vxor.v    $vr0,$vr0,$vr2         # 0 = 2A+B
    vshuf.b   $vr3,$vr18,$vr3,$vr4    # 3 = D
    addi.d    $a7,$a7,16             # next mc
    vxor.v    $vr3,$vr3,$vr0         # 3 = 2A+B+D
    vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = 2B+C
    andi      $a7,$a7,0x30           # ... mod 4
    addi.d    $t5,$t5,-1             # nr--
    vxor.v    $vr0,$vr0,$vr3         # 0 = 2A+3B+C+D

.Lenc_entry:
    # top of round
    vori.b    $vr1,$vr9,0           # 1 : i
    vori.b    $vr5,$vr11,0          # 2 : a/k
    vandn.v   $vr1,$vr1,$vr0        # 1 = i<<4
    vsrli.w   $vr1,$vr1,4           # 1 = i
    vand.v    $vr0,$vr0,$vr9        # 0 = k
    vshuf.b   $vr5,$vr18,$vr5,$vr0   # 2 = a/k
    vori.b    $vr3,$vr10,0          # 3 : 1/i
    vxor.v    $vr0,$vr0,$vr1        # 0 = j
    vshuf.b   $vr3,$vr18,$vr3,$vr1   # 3 = 1/i
    vori.b    $vr4,$vr10,0          # 4 : 1/j
    vxor.v    $vr3,$vr3,$vr5        # 3 = iak = 1/i + a/k
    vshuf.b   $vr4,$vr18,$vr4,$vr0   # 4 = 1/j
    vori.b    $vr2,$vr10,0          # 2 : 1/iak
    vxor.v    $vr4,$vr4,$vr5        # 4 = jak = 1/j + a/k
    vshuf.b   $vr2,$vr18,$vr2,$vr3   # 2 = 1/iak
    vori.b    $vr3,$vr10,0          # 3 : 1/jak
    vxor.v    $vr2,$vr2,$vr0        # 2 = io
    vshuf.b   $vr3,$vr18,$vr3,$vr4   # 3 = 1/jak
    vld       $vr5,$a5,0
    vxor.v    $vr3,$vr3,$vr1        # 3 = jo
    bnez      $t5,.Lenc_loop

    # middle of last round
    vld       $vr4,$a6,-0x60        # 3 : sbou  Lk_sbo
    vld       $vr0,$a6,-0x50        # 0 : sbot  Lk_sbo+16
    vshuf.b   $vr4,$vr18,$vr4,$vr2  # 4 = sbou
    vxor.v    $vr4,$vr4,$vr5        # 4 = sb1u + k
    vshuf.b   $vr0,$vr18,$vr0,$vr3  # 0 = sb1t
    add.d     $t0,$a7,$a6           # Lk_sr[]
    vld       $vr1,$t0,0x40
    vxor.v    $vr0,$vr0,$vr4        # 0 = A
    vshuf.b   $vr0,$vr18,$vr0,$vr1
    jr        $ra
.cfi_endproc
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

##
##  Decryption core
##
##  Same API as encryption core.
##
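##  The decrypt loop additionally folds the inverse MixColumns into four
##  sbox-output table pairs (Lk_dsb9/Lk_dsbd/Lk_dsbb/Lk_dsbe below),
##  accumulating into %vr0 and byte-rotating the MC vector between rounds.
##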
#.type	_vpaes_decrypt_core,\@abi-omnipotent
.align	4
_vpaes_decrypt_core:
.cfi_startproc
    move      $a5,$a2                  # load key
    ld.w      $t5,$a2,240
    vori.b    $vr1,$vr9,0
    la.local  $t0,Lk_dipt
    vld       $vr2,$t0,0               # iptlo
    vandn.v   $vr1,$vr1,$vr0
    move      $a7,$t5
    vsrli.w   $vr1,$vr1,4
    vld       $vr5,$a5,0               # round0 key
    slli.d    $a7,$a7,4
    vand.v    $vr0,$vr9,$vr0
    vshuf.b   $vr2,$vr18,$vr2,$vr0
    vld       $vr0,$t0,16              # ipthi
    xori      $a7,$a7,0x30
    la.local  $a6,Lk_dsbd
    vshuf.b   $vr0,$vr18,$vr0,$vr1
    andi      $a7,$a7,0x30
    vxor.v    $vr2,$vr2,$vr5
    la.local  $t0,Lk_mc_forward
    vld       $vr5,$t0,48
    vxor.v    $vr0,$vr0,$vr2
    addi.d    $a5,$a5,16
    add.d     $a7,$a7,$a6
    b         .Ldec_entry

.align 4
.Ldec_loop:
##
##  Inverse mix columns
##
    vld        $vr4,$a6,-0x20          # 4 : sb9u
    vld        $vr1,$a6,-0x10          # 0 : sb9t
    vshuf.b    $vr4,$vr18,$vr4,$vr2    # 4 = sb9u
    vshuf.b    $vr1,$vr18,$vr1,$vr3    # 0 = sb9t
    vxor.v     $vr0,$vr0,$vr4
    vld        $vr4,$a6,0x0            # 4 : sbdu
    vxor.v     $vr0,$vr0,$vr1          # 0 = ch
    vld        $vr1,$a6,0x10           # 0 : sbdt
    vshuf.b    $vr4,$vr18,$vr4,$vr2    # 4 = sbdu
    vshuf.b    $vr0,$vr18,$vr0,$vr5    # MC ch
    vshuf.b    $vr1,$vr18,$vr1,$vr3    # 0 = sbdt
    vxor.v     $vr0,$vr0,$vr4          # 4 = ch
    vld        $vr4,$a6,0x20           # 4 : sbbu
    vxor.v     $vr0,$vr0,$vr1          # 0 = ch
    vld        $vr1,$a6,0x30           # 0 : sbbt
    vshuf.b    $vr4,$vr18,$vr4,$vr2    # 4 = sbbu
    vshuf.b    $vr0,$vr18,$vr0,$vr5    # MC ch
    vshuf.b    $vr1,$vr18,$vr1,$vr3    # 0 = sbbt
    vxor.v     $vr0,$vr0,$vr4          # 4 = ch
    vld        $vr4,$a6,0x40           # 4 : sbeu
    vxor.v     $vr0,$vr0,$vr1          # 0 = ch
    vld        $vr1,$a6,0x50           # 0 : sbet
    vshuf.b    $vr4,$vr18,$vr4,$vr2    # 4 = sbeu
    vshuf.b    $vr0,$vr18,$vr0,$vr5    # MC ch
    vshuf.b    $vr1,$vr18,$vr1,$vr3    # 0 = sbet
    vxor.v     $vr0,$vr0,$vr4          # 4 = ch
    addi.d     $a5,$a5,16              # next round key
    vbsrl.v    $vr16,$vr5,0xc
    vbsll.v    $vr5,$vr5,0x4
    vor.v      $vr5,$vr5,$vr16
    vxor.v     $vr0,$vr0,$vr1          # 0 = ch
    addi.d     $t5,$t5,-1              # nr--

.Ldec_entry:
    # top of round
    vori.b     $vr1,$vr9,0             # 1 : i
    vandn.v    $vr1,$vr1,$vr0          # 1 = i<<4
    vori.b     $vr2,$vr11,0            # 2 : a/k
    vsrli.w    $vr1,$vr1,4             # 1 = i
    vand.v     $vr0,$vr0,$vr9          # 0 = k
    vshuf.b    $vr2,$vr18,$vr2,$vr0    # 2 = a/k
    vori.b     $vr3,$vr10,0            # 3 : 1/i
    vxor.v     $vr0,$vr0,$vr1          # 0 = j
    vshuf.b    $vr3,$vr18,$vr3,$vr1    # 3 = 1/i
    vori.b     $vr4,$vr10,0            # 4 : 1/j
    vxor.v     $vr3,$vr3,$vr2          # 3 = iak = 1/i + a/k
    vshuf.b    $vr4,$vr18,$vr4,$vr0    # 4 = 1/j
    vxor.v     $vr4,$vr4,$vr2          # 4 = jak = 1/j + a/k
    vori.b     $vr2,$vr10,0            # 2 : 1/iak
    vshuf.b    $vr2,$vr18,$vr2,$vr3    # 2 = 1/iak
    vori.b     $vr3,$vr10,0            # 3 : 1/jak
    vxor.v     $vr2,$vr2,$vr0          # 2 = io
    vshuf.b    $vr3,$vr18,$vr3,$vr4    # 3 = 1/jak
    vld        $vr0,$a5,0
    vxor.v     $vr3,$vr3,$vr1          # 3 = jo
    bnez       $t5,.Ldec_loop

    # middle of last round
    vld        $vr4,$a6,0x60           # 3 : sbou
    vshuf.b    $vr4,$vr18,$vr4,$vr2    # 4 = sbou
    vxor.v     $vr4,$vr4,$vr0          # 4 = sb1u + k
    vld        $vr0,$a6,0x70           # 0 : sbot
    vld        $vr2,$a7,-0x160         # Lk_sr-Lk_dsbd=-0x160
    vshuf.b    $vr0,$vr18,$vr0,$vr3    # 0 = sb1t
    vxor.v     $vr0,$vr0,$vr4          # 0 = A
    vshuf.b    $vr0,$vr18,$vr0,$vr2
    jr         $ra
.cfi_endproc
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
#.type	_vpaes_schedule_core,\@abi-omnipotent
.align	4
_vpaes_schedule_core:
.cfi_startproc
    # a0 = key
    # a1 = size in bits
    # a2 = buffer
    # a3 = direction.  0=encrypt, 1=decrypt

    addi.d    $sp,$sp,-48
    st.d      $ra,$sp,40
    st.d      $fp,$sp,32

    bl        _vpaes_preheat           # load the tables
    la.local  $t0,Lk_rcon
    vld       $vr8,$t0,0               # load rcon
    vld       $vr0,$a0,0               # load key (unaligned)

    # input transform
    vori.b    $vr3,$vr0,0
    la.local  $a7,Lk_ipt
    bl        _vpaes_schedule_transform
    vori.b    $vr7,$vr0,0

    la.local  $a6,Lk_sr
    bnez      $a3,.Lschedule_am_decrypting

    # encrypting, output zeroth round key after transform
    vst       $vr0,$a2,0
    b         .Lschedule_go

.Lschedule_am_decrypting:
    # decrypting, output zeroth round key after shiftrows
    add.d     $t2,$a4,$a6
    vld       $vr1,$t2,0
    vshuf.b   $vr3,$vr18,$vr3,$vr1
    vst       $vr3,$a2,0
    xori      $a4,$a4,0x30

.Lschedule_go:
    li.d      $t6,192
    bltu      $t6,$a1,.Lschedule_256
    beq       $t6,$a1,.Lschedule_192
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
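##  (AES-128 needs 11 round keys: round 0 was stored by the caller,
##   rounds 1-9 come out of _vpaes_schedule_mangle in the loop below,
##   and the 10th goes through .Lschedule_mangle_last.)
##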
.Lschedule_128:
     li.w    $a1,10

.Loop_schedule_128:
     bl      _vpaes_schedule_round
     addi.w  $a1,$a1,-1
     beqz    $a1,.Lschedule_mangle_last
     bl      _vpaes_schedule_mangle
     b       .Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %vr7 as before, and the short, low side is in
##  the high bits of %vr6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
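##  (AES-192 needs 13 round keys; beyond the round-0 key already
##   stored, that is 12 more, i.e. the four 3-key cycles that
##   $a1 = 4 programs below.)
##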
.align	4
.Lschedule_192:
     vld        $vr0,$a0,8                      # load key part 2
     bl         _vpaes_schedule_transform       # input transform
     vaddi.du   $vr6,$vr0,0x0                   # save short part
     vxor.v     $vr4,$vr4,$vr4                  # clear 4
     vpackod.d  $vr6,$vr6,$vr4                  # clobber low side with zeros
     li.w       $a1,4

.Loop_schedule_192:
     bl         _vpaes_schedule_round
     vbsrl.v    $vr16,$vr6,0x8
     vbsll.v    $vr0,$vr0,0x8
     vor.v      $vr0,$vr0,$vr16

     bl         _vpaes_schedule_mangle          # save key n
     bl         _vpaes_schedule_192_smear
     bl         _vpaes_schedule_mangle          # save key n+1
     bl         _vpaes_schedule_round
     addi.w     $a1,$a1,-1
     beqz       $a1,.Lschedule_mangle_last
     bl         _vpaes_schedule_mangle          # save key n+2
     bl         _vpaes_schedule_192_smear
     b          .Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %vr6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
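##  (AES-256 needs 15 round keys: after the round-0 store, each trip
##   through .Loop_schedule_256 emits a low-side and a high-side key,
##   so $a1 = 7 trips, the last one cut short by .Lschedule_mangle_last,
##   cover the remaining 14.)
##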
.align	4
.Lschedule_256:
     vld        $vr0,$a0,16                     # load key part 2 (unaligned)
     bl         _vpaes_schedule_transform       # input transform
     addi.w     $a1,$zero,7

.Loop_schedule_256:
     bl         _vpaes_schedule_mangle          # output low result
     vori.b     $vr6,$vr0,0                     # save cur_lo in vr6

     # high round
     bl         _vpaes_schedule_round
     addi.d     $a1,$a1,-1
     beqz       $a1,.Lschedule_mangle_last
     bl         _vpaes_schedule_mangle

     # low round. swap vr7 and vr6
     vshuf4i.w  $vr0,$vr0,0xFF
     vori.b     $vr5,$vr7,0
     vori.b     $vr7,$vr6,0
     bl         _vpaes_schedule_low_round
     vori.b     $vr7,$vr5,0

     b          .Loop_schedule_256


##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %vr0
##    when encrypting, outputs out(%vr0) ^ 63
##    when decrypting, outputs unskew(%vr0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
.Lschedule_mangle_last:
     # schedule last round key from vr0
     la.local   $a7,Lk_deskew                   # prepare to deskew
     bnez       $a3,.Lschedule_mangle_last_dec

     # encrypting
     add.d      $t0,$a4,$a6
     vld        $vr1,$t0,0
     vshuf.b    $vr0,$vr18,$vr0,$vr1            # output permute
     la.local   $a7,Lk_opt                      # prepare to output transform
     addi.d     $a2,$a2,32

.Lschedule_mangle_last_dec:
     addi.d     $a2,$a2,-16
     la.local   $t0,Lk_s63
     vld        $vr16,$t0,0
     vxor.v     $vr0,$vr0,$vr16
     bl         _vpaes_schedule_transform       # output transform
     vst        $vr0,$a2,0                      # save last key

     # cleanup
     vxor.v     $vr0,$vr0,$vr0
     vxor.v     $vr1,$vr1,$vr1
     vxor.v     $vr2,$vr2,$vr2
     vxor.v     $vr3,$vr3,$vr3
     vxor.v     $vr4,$vr4,$vr4
     vxor.v     $vr5,$vr5,$vr5
     vxor.v     $vr6,$vr6,$vr6
     vxor.v     $vr7,$vr7,$vr7
     ld.d       $ra,$sp,40
     ld.d       $fp,$sp,32
     addi.d     $sp,$sp,48
     jr         $ra
.cfi_endproc
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %vr7: high side, b  a  x  y
##    %vr6:  low side, d  c  0  0
##
##  Outputs:
##    %vr6: b+c+d  b+c  0  0
##    %vr0: b+c+d  b+c  b  a
##
##  Clobbers %vr1.
##
#.type	_vpaes_schedule_192_smear,\@abi-omnipotent
.align	4
_vpaes_schedule_192_smear:
.cfi_startproc
    vshuf4i.w   $vr1,$vr6,0x80  # d c 0 0 -> c 0 0 0
    vshuf4i.w   $vr0,$vr7,0xFE  # b a _ _ -> b b b a
    vxor.v      $vr6,$vr6,$vr1  # -> c+d c 0 0
    vxor.v      $vr1,$vr1,$vr1
    vxor.v      $vr6,$vr6,$vr0  # -> b+c+d b+c b a
    vori.b      $vr0,$vr6,0
    vilvh.d     $vr6,$vr6,$vr1  # clobber low side with zeros
    jr          $ra
.cfi_endproc
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %vr0, %vr7
##
##  Specifically, runs subbytes on the high dword of %vr0
##  then rotates it by one byte and xors into the low dword of
##  %vr7.
##
##  Adds rcon from low byte of %vr8, then rotates %vr8 for
##  next rcon.
##
##  Smears the dwords of %vr7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %vr7 = %vr0.
##  Clobbers %vr1-%vr4, %vr16.
##
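##  (In textbook terms: the new 128-bit round key is the prefix-xor
##   "smear" of the previous one, xored with SubWord(RotWord(w_last))
##   and the round constant.)
##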
#.type	_vpaes_schedule_round,\@abi-omnipotent
.align	4
_vpaes_schedule_round:
.cfi_startproc
    # extract rcon from vr8
    vxor.v      $vr1,$vr1,$vr1
    vbsrl.v     $vr16,$vr8,0xf
    vbsll.v     $vr1,$vr1,0x1
    vor.v       $vr1,$vr1,$vr16
    vbsrl.v     $vr16,$vr8,0xf
    vbsll.v     $vr8,$vr8,0x1
    vor.v       $vr8,$vr8,$vr16

    vxor.v      $vr7,$vr7,$vr1

    # rotate
    vshuf4i.w   $vr0,$vr0,0xff          # broadcast the top word of $vr0 to all lanes
    vbsrl.v     $vr16,$vr0,0x1
    vbsll.v     $vr0,$vr0,0xf
    vor.v       $vr0,$vr0,$vr16

    # fall through...

    # low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    # smear vr7
    vaddi.du    $vr1,$vr7,0x0
    vbsll.v     $vr7,$vr7,0x4
    vxor.v      $vr7,$vr7,$vr1
    vaddi.du    $vr1,$vr7,0x0
    vbsll.v     $vr7,$vr7,0x8
    vxor.v      $vr7,$vr7,$vr1
    vxori.b     $vr7,$vr7,0x5B

    # subbytes
    vaddi.du    $vr1,$vr9,0x0
    vandn.v     $vr1,$vr1,$vr0
    vsrli.w     $vr1,$vr1,0x4                   # 1 = i
    vand.v      $vr0,$vr0,$vr9                  # 0 = k
    vaddi.du    $vr2,$vr11,0x0                  # 2 : a/k
    vshuf.b     $vr2,$vr18,$vr2,$vr0            # 2 = a/k
    vxor.v      $vr0,$vr0,$vr1                  # 0 = j
    vaddi.du    $vr3,$vr10,0x0                  # 3 : 1/i
    vshuf.b     $vr3,$vr18,$vr3,$vr1            # 3 = 1/i
    vxor.v      $vr3,$vr3,$vr2                  # 3 = iak = 1/i + a/k
    vaddi.du    $vr4,$vr10,0x0                  # 4 : 1/j
    vshuf.b     $vr4,$vr18,$vr4,$vr0            # 4 = 1/j
    vxor.v      $vr4,$vr4,$vr2                  # 4 = jak = 1/j + a/k
    vaddi.du    $vr2,$vr10,0x0                  # 2 : 1/iak
    vshuf.b     $vr2,$vr18,$vr2,$vr3            # 2 = 1/iak
    vxor.v      $vr2,$vr2,$vr0                  # 2 = io
    vaddi.du    $vr3,$vr10,0x0                  # 3 : 1/jak
    vshuf.b     $vr3,$vr18,$vr3,$vr4            # 3 = 1/jak
    vxor.v      $vr3,$vr3,$vr1                  # 3 = jo
    vaddi.du    $vr4,$vr13,0x0                  # 4 : sbou
    vshuf.b     $vr4,$vr18,$vr4,$vr2            # 4 = sbou
    vaddi.du    $vr0,$vr12,0x0                  # 0 : sbot
    vshuf.b     $vr0,$vr18,$vr0,$vr3            # 0 = sb1t
    vxor.v      $vr0,$vr0,$vr4                  # 0 = sbox output

    # add in smeared stuff
    vxor.v      $vr0,$vr0,$vr7
    vaddi.du    $vr7,$vr0,0x0
    jr          $ra
.cfi_endproc
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %vr0 according to tables at ($a7)
##
##  Requires that %vr9 = 0x0F0F... as in preheat
##  Output in %vr0
##  Clobbers %vr1, %vr2
##
#.type	_vpaes_schedule_transform,\@abi-omnipotent
.align	4
_vpaes_schedule_transform:
.cfi_startproc
    vori.b     $vr1,$vr9,0
    vandn.v    $vr1,$vr1,$vr0
    vsrli.w    $vr1,$vr1,4
    vand.v     $vr0,$vr0,$vr9
    vld        $vr2,$a7,0      # lo
    vshuf.b    $vr2,$vr18,$vr2,$vr0
    vld        $vr0,$a7,16     # hi
    vshuf.b    $vr0,$vr18,$vr0,$vr1
    vxor.v     $vr0,$vr0,$vr2
    jr         $ra
.cfi_endproc
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle vr0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%a2), and increments or decrements it
##  Keeps track of round number mod 4 in %a4
##  Preserves vr0
##  Clobbers vr1-vr5
##
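##  ($a4 walks the Lk_sr table in 0x10 steps, wrapped mod 0x30 at
##   .Lschedule_mangle_both, so each stored key gets the shiftrows
##   permutation for its position.)
##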
#.type	_vpaes_schedule_mangle,\@abi-omnipotent
.align	4
_vpaes_schedule_mangle:
.cfi_startproc
    vori.b     $vr4,$vr0,0     # save vr0 for later
    la.local   $t0,Lk_mc_forward
    vld        $vr5,$t0,0
    bnez       $a3,.Lschedule_mangle_dec

    # encrypting
    addi.d     $a2,$a2,16
    la.local   $t0,Lk_s63
    vld        $vr16,$t0,0
    vxor.v     $vr4,$vr4,$vr16
    vshuf.b    $vr4,$vr18,$vr4,$vr5
    vori.b     $vr3,$vr4,0
    vshuf.b    $vr4,$vr18,$vr4,$vr5
    vxor.v     $vr3,$vr3,$vr4
    vshuf.b    $vr4,$vr18,$vr4,$vr5
    vxor.v     $vr3,$vr3,$vr4

    b          .Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
    # inverse mix columns
    la.local   $a7,Lk_dksd
    vori.b     $vr1,$vr9,0
    vandn.v    $vr1,$vr1,$vr4
    vsrli.w    $vr1,$vr1,4     # 1 = hi
    vand.v     $vr4,$vr4,$vr9  # 4 = lo

    vld        $vr2,$a7,0
    vshuf.b    $vr2,$vr18,$vr2,$vr4
    vld        $vr3,$a7,0x10
    vshuf.b    $vr3,$vr18,$vr3,$vr1
    vxor.v     $vr3,$vr3,$vr2
    vshuf.b    $vr3,$vr18,$vr3,$vr5

    vld        $vr2,$a7,0x20
    vshuf.b    $vr2,$vr18,$vr2,$vr4
    vxor.v     $vr2,$vr2,$vr3
    vld        $vr3,$a7,0x30
    vshuf.b    $vr3,$vr18,$vr3,$vr1
    vxor.v     $vr3,$vr3,$vr2
    vshuf.b    $vr3,$vr18,$vr3,$vr5

    vld        $vr2,$a7,0x40
    vshuf.b    $vr2,$vr18,$vr2,$vr4
    vxor.v     $vr2,$vr2,$vr3
    vld        $vr3,$a7,0x50
    vshuf.b    $vr3,$vr18,$vr3,$vr1
    vxor.v     $vr3,$vr3,$vr2
    vshuf.b    $vr3,$vr18,$vr3,$vr5

    vld        $vr2,$a7,0x60
    vshuf.b    $vr2,$vr18,$vr2,$vr4
    vxor.v     $vr2,$vr2,$vr3
    vld        $vr3,$a7,0x70
    vshuf.b    $vr3,$vr18,$vr3,$vr1
    vxor.v     $vr3,$vr3,$vr2

    addi.d     $a2,$a2,-16

.Lschedule_mangle_both:
    add.d      $t2,$a4,$a6
    vld        $vr1,$t2,0
    vshuf.b    $vr3,$vr18,$vr3,$vr1
    addi.d     $a4,$a4,-16
    andi       $a4,$a4,0x30
    vst        $vr3,$a2,0
    jirl       $zero,$ra,0
.cfi_endproc
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

#
# Interface to OpenSSL
#
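# From C these behave like the classic AES entry points; a sketch of the
# expected prototypes (using the standard AES_KEY type, cf. the ->rounds
# stores below):
#   int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#                              AES_KEY *key);
#   int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#                              AES_KEY *key);
#   void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#                      const AES_KEY *key);
#   void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#                      const AES_KEY *key);
#   void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                          size_t length, const AES_KEY *key,
#                          unsigned char *ivec, int enc);
#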
.globl	${PREFIX}_set_encrypt_key
#.type	${PREFIX}_set_encrypt_key,\@function,3
.align	4
${PREFIX}_set_encrypt_key:
.cfi_startproc
___
$code.=<<___;
    addi.d   $sp,$sp,-48
    st.d     $ra,$sp,40
    st.d     $fp,$sp,32
    move     $t5,$a1
    srli.w   $t5,$t5,0x5
    addi.w   $t5,$t5,0x5
    st.w     $t5,$a2,240	# AES_KEY->rounds = nbits/32+5;
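    # (e.g. nbits=128 stores 9 here: the number of .Lenc_loop/.Ldec_loop
    #  middle rounds the cores run, the final round being applied after
    #  the loop)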

    move     $a3,$zero
    li.d     $a4,0x30
    bl       _vpaes_schedule_core
___
$code.=<<___;
    xor      $a0,$a0,$a0
    ld.d     $ra,$sp,40
    ld.d     $fp,$sp,32
    addi.d   $sp,$sp,48
    jirl     $zero,$ra,0
.cfi_endproc
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key

.globl	${PREFIX}_set_decrypt_key
#.type	${PREFIX}_set_decrypt_key,\@function,3
.align	4
${PREFIX}_set_decrypt_key:
.cfi_startproc

.Ldec_key_body:
___
$code.=<<___;
    addi.d   $sp,$sp,-48
    st.d     $ra,$sp,40
    st.d     $fp,$sp,32

    move     $t5,$a1
    srli.w   $t5,$t5,5
    addi.w   $t5,$t5,5
    st.w     $t5,$a2,240	# AES_KEY->rounds = nbits/32+5;
    slli.w   $t5,$t5,4
    add.d    $t0,$a2,$t5
    addi.d   $a2,$t0,16

    li.d     $a3,0x1
    move     $a4,$a1
    srli.w   $a4,$a4,1
    andi     $a4,$a4,32
    xori     $a4,$a4,32		# nbits==192?0:32
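    # (i.e. ((nbits >> 1) & 32) ^ 32: 128 -> 32, 192 -> 0, 256 -> 32)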
    bl       _vpaes_schedule_core

.Ldec_key_epilogue:
___
$code.=<<___;
    xor      $a0,$a0,$a0
    ld.d     $ra,$sp,40
    ld.d     $fp,$sp,32
    addi.d   $sp,$sp,48
    jirl     $zero,$ra,0
.cfi_endproc
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key

.globl	${PREFIX}_encrypt
#.type	${PREFIX}_encrypt,\@function,3
.align	4
${PREFIX}_encrypt:
.cfi_startproc
.Lenc_body:
___
$code.=<<___;
    addi.d  $sp,$sp,-48
    st.d    $ra,$sp,40
    st.d    $fp,$sp,32
    vld     $vr0,$a0,0x0
    bl      _vpaes_preheat
    bl      _vpaes_encrypt_core
    vst     $vr0,$a1,0x0
.Lenc_epilogue:
___
$code.=<<___;
    ld.d     $ra,$sp,40
    ld.d     $fp,$sp,32
    addi.d   $sp,$sp,48
    jirl     $zero,$ra,0
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
#.type	${PREFIX}_decrypt,\@function,3
.align	4
${PREFIX}_decrypt:
.cfi_startproc
___
$code.=<<___;
    addi.d  $sp,$sp,-48
    st.d    $ra,$sp,40
    st.d    $fp,$sp,32
    vld     $vr0,$a0,0x0
    bl      _vpaes_preheat
    bl      _vpaes_decrypt_core
    vst     $vr0,$a1,0x0
___
$code.=<<___;
    ld.d    $ra,$sp,40
    ld.d    $fp,$sp,32
    addi.d  $sp,$sp,48
    jirl    $zero,$ra,0
.cfi_endproc
.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
___
{
my ($inp,$out,$len,$key,$ivp,$enc)=("$a0","$a1","$a2","$a3","$a4","$a5");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp,const int enc);
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
#.type	${PREFIX}_cbc_encrypt,\@function,6
.align	4
${PREFIX}_cbc_encrypt:
.cfi_startproc
    addi.d  $sp,$sp,-48
    st.d    $ra,$sp,40
    st.d    $fp,$sp,32

    ori     $t0,$len,0
    ori     $len,$key,0
    ori     $key,$t0,0
___
($len,$key)=($key,$len);
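# the cipher cores expect the key schedule pointer in $a2, hence the
# register swap above and the matching swap of the Perl names here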
$code.=<<___;
    addi.d  $len,$len,-16
    blt     $len,$zero,.Lcbc_abort
___
$code.=<<___;
    vld     $vr6,$ivp,0		# load IV
    sub.d   $out,$out,$inp
    bl      _vpaes_preheat
    beqz    $a5,.Lcbc_dec_loop
    b       .Lcbc_enc_loop
.align	4
.Lcbc_enc_loop:
    vld     $vr0,$inp,0
    vxor.v  $vr0,$vr0,$vr6
    bl      _vpaes_encrypt_core
    vori.b  $vr6,$vr0,0
    add.d   $t0,$out,$inp
    vst     $vr0,$t0,0
    addi.d  $inp,$inp,16
    addi.d  $len,$len,-16
    bge     $len,$zero,.Lcbc_enc_loop
    b       .Lcbc_done
.align	4
.Lcbc_dec_loop:
    vld     $vr0,$inp,0
    vori.b  $vr7,$vr0,0
    bl      _vpaes_decrypt_core
    vxor.v  $vr0,$vr0,$vr6
    vori.b  $vr6,$vr7,0
    add.d   $t0,$out,$inp
    vst     $vr0,$t0,0
    addi.d  $inp,$inp,16
    addi.d  $len,$len,-16
    bge     $len,$zero,.Lcbc_dec_loop
.Lcbc_done:
    vst     $vr6,$ivp,0		# save IV
___
$code.=<<___;
.Lcbc_abort:
    ld.d    $ra,$sp,40
    ld.d    $fp,$sp,32
    addi.d  $sp,$sp,48
    jirl    $zero,$ra,0
.cfi_endproc
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
{
$code.=<<___;
##
##  _aes_preheat
##
##  Fills register $a6 -> Lk_s0F (so you can -fPIC)
##  and %vr9-%vr15 as specified below.
##
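##  The vld offsets below are relative to Lk_s0F and rely on the .rodata
##  layout at the end of this file: Lk_inv sits 0x20 before it, and
##  Lk_ipt/Lk_sb1/Lk_sb2 follow at +0x10/+0x30/+0x50.
##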
#.type	_vpaes_preheat,\@abi-omnipotent
.align	4
_vpaes_preheat:
.cfi_startproc
    la.local  $a6,Lk_s0F
    vld       $vr10,$a6,-0x20		# Lk_inv
    vld       $vr11,$a6,-0x10		# Lk_inv+16
    vld       $vr9,$a6,0		# Lk_s0F
    vld       $vr13,$a6,0x30		# Lk_sb1
    vld       $vr12,$a6,0x40		# Lk_sb1+16
    vld       $vr15,$a6,0x50		# Lk_sb2
    vld       $vr14,$a6,0x60		# Lk_sb2+16
    vldi      $vr18,0			# $vr18 stays zero for the rest of this file
    jirl      $zero,$ra,0
.cfi_endproc
.size	_vpaes_preheat,.-_vpaes_preheat
___
}
########################################################
##                                                    ##
##                     Constants                      ##
##                                                    ##
########################################################
$code.=<<___;
.section .rodata
.align	6
Lk_inv:	# inv, inva
    .quad	0x0E05060F0D080110, 0x040703090A0B0C02
    .quad	0x01040A060F0B0710, 0x030D0E0C02050809

Lk_s0F:	# s0F
    .quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F

Lk_ipt:	# input transform (lo, hi)
    .quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
    .quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

Lk_sb1:	# sb1u, sb1t
    .quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
    .quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
Lk_sb2:	# sb2u, sb2t
    .quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
    .quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
Lk_sbo:	# sbou, sbot
    .quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
    .quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

Lk_mc_forward:	# mc_forward
    .quad	0x0407060500030201, 0x0C0F0E0D080B0A09
    .quad	0x080B0A0904070605, 0x000302010C0F0E0D
    .quad	0x0C0F0E0D080B0A09, 0x0407060500030201
    .quad	0x000302010C0F0E0D, 0x080B0A0904070605

Lk_mc_backward:# mc_backward
    .quad	0x0605040702010003, 0x0E0D0C0F0A09080B
    .quad	0x020100030E0D0C0F, 0x0A09080B06050407
    .quad	0x0E0D0C0F0A09080B, 0x0605040702010003
    .quad	0x0A09080B06050407, 0x020100030E0D0C0F

Lk_sr:		# sr
    .quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
    .quad	0x030E09040F0A0500, 0x0B06010C07020D08
    .quad	0x0F060D040B020900, 0x070E050C030A0108
    .quad	0x0B0E0104070A0D00, 0x0306090C0F020508

Lk_rcon:	# rcon
    .quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

Lk_s63:	# s63: all equal to 0x63 transformed
    .quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

Lk_opt:	# output transform
    .quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
    .quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

Lk_deskew:	# deskew tables: inverts the sbox's "skew"
    .quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
    .quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

##
##  Decryption stuff
##  Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
    .quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
    .quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb:	# decryption key schedule: invskew x*B
    .quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
    .quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
    .quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
    .quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9:	# decryption key schedule: invskew x*9
    .quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
    .quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

##
##  Decryption stuff
##  Round function constants
##
Lk_dipt:	# decryption input transform
    .quad	0x0F505B040B545F00, 0x154A411E114E451A
    .quad	0x86E383E660056500, 0x12771772F491F194

Lk_dsb9:	# decryption sbox output *9*u, *9*t
    .quad	0x851C03539A86D600, 0xCAD51F504F994CC9
    .quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
Lk_dsbd:	# decryption sbox output *D*u, *D*t
    .quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
    .quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
Lk_dsbb:	# decryption sbox output *B*u, *B*t
    .quad	0xD022649296B44200, 0x602646F6B0F2D404
    .quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
Lk_dsbe:	# decryption sbox output *E*u, *E*t
    .quad	0x46F2929626D4D000, 0x2242600464B4F6B0
    .quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
Lk_dsbo:	# decryption sbox final output
    .quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
    .quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.asciz	"Vector Permutation AES for loongarch64/lsx, Mike Hamburg (Stanford University)"
.align	6
___


$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";