1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Poly1305 hash for MIPS64.
18#
19# May 2016
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone.
22#
23#		IALU/gcc
24# R1x000	5.64/+120%	(big-endian)
25# Octeon II	3.80/+280%	(little-endian)
26
27######################################################################
28# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
29# widely used. Then there is a new contender: NUBI. It appears that if
30# one picks the latter, it's possible to arrange code in an ABI-neutral
31# manner. Therefore let's stick to the NUBI register layout:
32#
# NUBI register names. Each variable holds the "$<number>" operand text
# that gets interpolated into the assembly templates further down.
($zero, $at, $t0, $t1, $t2) = map { sprintf '$%d', $_ } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { sprintf '$%d', $_ } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { sprintf '$%d', $_ } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) = map { sprintf '$%d', $_ } 3, 28 .. 31;
37#
38# The return value is placed in $a0. The following coding rules facilitate
39# interoperability:
40#
41# - never ever touch $tp, "thread pointer", former $gp [o32 can be
42#   excluded from the rule, because it's specified volatile];
43# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
44#   old code];
45# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
46#
47# For reference here is register layout for N32/64 MIPS ABIs:
48#
49# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
50# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
51# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
52# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
53# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
54#
55# <appro@openssl.org>
56#
57######################################################################
58
# Command line: [flavour] [output-file]
#
# The last argument is taken as the output file when it ends in an
# extension; otherwise $output stays undefined.
$output = undef;
if (@ARGV && $ARGV[-1] =~ m|\.\w+$|) {
    $output = pop @ARGV;
}

# The first remaining argument is taken as the flavour when it contains no
# dot. Known flavours are o32, n32, 64, nubi32 and nubi64; "o32" is assumed
# when none is given, and is then rejected by the check below.
$flavour = "o32";
if (@ARGV && $ARGV[0] !~ m|\.|) {
    $flavour = shift @ARGV;
}

$flavour =~ /64|n32/i or die "MIPS64 only";

# Return-value register and the .mask value for the registers saved by
# poly1305_blocks_internal: NUBI returns in $a0 and saves $s0-$s5 (bits
# 12-17); the other flavours return in $t0 and save only $s4/$s5 (bits 16-17).
if ($flavour =~ /nubi/i) {
    $v0              = $a0;
    $SAVED_REGS_MASK = "0x0003f000";
}
else {
    $v0              = $t0;
    $SAVED_REGS_MASK = "0x00030000";
}

# Argument registers shared by the routines, followed by the scratch
# registers used for input words and temporaries.
($ctx, $inp, $len, $padbit) = ($a0, $a1, $a2, $a3);
($in0, $in1) = ($a4, $a5);
($tmp0, $tmp1, $tmp2, $tmp3, $tmp4) = ($a6, $a7, $at, $t0, $t1);
72
# poly1305_init(ctx, key), emitted together with the file preamble
# (mips_arch.h include, MSB/LSB byte offsets used by the ldl/ldr pairs,
# .text and the noat/noreorder settings).
#
# - Zeroes the three 64-bit words at ctx+0, ctx+8 and ctx+16.
# - If the key pointer ($inp) is zero, branches straight to .Lno_key.
# - Otherwise loads 16 key bytes as two 64-bit words: plain ld on
#   MIPS64R6, ldl/ldr pairs (the unaligned doubleword load idiom) elsewhere.
# - On big-endian builds (MIPSEB) byte-swaps both words: dsbh + dshd on
#   MIPS64R2, a mask/shift/or sequence otherwise.
# - Builds the mask 0x0ffffffc0fffffff in $tmp0 and applies it to the first
#   word, then subtracts 3 (0x0ffffffc0ffffffc) and applies it to the second.
# - Stores the two masked words at ctx+24 and ctx+32 and r1 + (r1 >> 2)
#   at ctx+40.
# - Returns 0 in $v0 on both paths.
#
# Everything between the heredoc markers is assembler/cpp text that is
# emitted verbatim, so no Perl comments can be placed inside it.
73$code.=<<___;
74#include "mips_arch.h"
75
76#ifdef MIPSEB
77# define MSB 0
78# define LSB 7
79#else
80# define MSB 7
81# define LSB 0
82#endif
83
84.text
85.set	noat
86.set	noreorder
87
88.align	5
89.globl	poly1305_init
90.ent	poly1305_init
91poly1305_init:
92	.frame	$sp,0,$ra
93	.set	reorder
94
95	sd	$zero,0($ctx)
96	sd	$zero,8($ctx)
97	sd	$zero,16($ctx)
98
99	beqz	$inp,.Lno_key
100
101#if defined(_MIPS_ARCH_MIPS64R6)
102	ld	$in0,0($inp)
103	ld	$in1,8($inp)
104#else
105	ldl	$in0,0+MSB($inp)
106	ldl	$in1,8+MSB($inp)
107	ldr	$in0,0+LSB($inp)
108	ldr	$in1,8+LSB($inp)
109#endif
110#ifdef	MIPSEB
111# if defined(_MIPS_ARCH_MIPS64R2)
112	dsbh	$in0,$in0		# byte swap
113	 dsbh	$in1,$in1
114	dshd	$in0,$in0
115	 dshd	$in1,$in1
116# else
117	ori	$tmp0,$zero,0xFF
118	dsll	$tmp2,$tmp0,32
119	or	$tmp0,$tmp2		# 0x000000FF000000FF
120
121	and	$tmp1,$in0,$tmp0	# byte swap
122	 and	$tmp3,$in1,$tmp0
123	dsrl	$tmp2,$in0,24
124	 dsrl	$tmp4,$in1,24
125	dsll	$tmp1,24
126	 dsll	$tmp3,24
127	and	$tmp2,$tmp0
128	 and	$tmp4,$tmp0
129	dsll	$tmp0,8			# 0x0000FF000000FF00
130	or	$tmp1,$tmp2
131	 or	$tmp3,$tmp4
132	and	$tmp2,$in0,$tmp0
133	 and	$tmp4,$in1,$tmp0
134	dsrl	$in0,8
135	 dsrl	$in1,8
136	dsll	$tmp2,8
137	 dsll	$tmp4,8
138	and	$in0,$tmp0
139	 and	$in1,$tmp0
140	or	$tmp1,$tmp2
141	 or	$tmp3,$tmp4
142	or	$in0,$tmp1
143	 or	$in1,$tmp3
144	dsrl	$tmp1,$in0,32
145	 dsrl	$tmp3,$in1,32
146	dsll	$in0,32
147	 dsll	$in1,32
148	or	$in0,$tmp1
149	 or	$in1,$tmp3
150# endif
151#endif
152	li	$tmp0,1
153	dsll	$tmp0,32
154	daddiu	$tmp0,-63
155	dsll	$tmp0,28
156	daddiu	$tmp0,-1		# 0ffffffc0fffffff
157
158	and	$in0,$tmp0
159	daddiu	$tmp0,-3		# 0ffffffc0ffffffc
160	and	$in1,$tmp0
161
162	sd	$in0,24($ctx)
163	dsrl	$tmp0,$in1,2
164	sd	$in1,32($ctx)
165	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
166	sd	$tmp0,40($ctx)
167
168.Lno_key:
169	li	$v0,0			# return 0
170	jr	$ra
171.end	poly1305_init
172___
# poly1305_blocks(ctx, inp, len, padbit) and its worker
# poly1305_blocks_internal.
#
# poly1305_blocks turns len into a count of complete 16-byte blocks
# (len >> 4) and returns at once when that count is zero; otherwise it
# branches to poly1305_blocks_internal with the count left in $len.
#
# poly1305_blocks_internal:
# - allocates a 6*8-byte frame and saves $s4/$s5 (plus $s0-$s3 for NUBI);
# - loads the hash limbs h0,h1,h2 from ctx+0/8/16 and r0,r1,s1 from
#   ctx+24/32/40;
# - for each block: loads 16 input bytes (ld on MIPS64R6, ldl/ldr pairs
#   otherwise), byte-swaps them on big-endian builds, adds them into h0:h1
#   with the carry going into h2, and adds $padbit to h2;
# - accumulates the products into the three-limb value d0:d1:d2:
#   h0*r0 and h1*s1 at limb 0, h0*r1, h1*r0 and h2*s1 at limb 1, and
#   h2*r0 at limb 2, with explicit sltu/daddu carry propagation;
# - reduces: h2 = d2 & 3, and (d2 & ~3) + (d2 >> 2) is added to d0, the
#   carries rippling through d1 into h2;
# - after the last block stores h0,h1,h2 back to ctx, restores the saved
#   registers and returns, releasing the frame in the instruction that
#   follows "jr $ra" under .set noreorder (the branch delay slot).
#
# Instructions indented by one extra space look like independent work
# interleaved with the multiplies. The parenthesised dmultu/mflo/mfhi
# operand forms are presumably macros from mips_arch.h - not visible here.
173{
174my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
175   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
176
177$code.=<<___;
178.align	5
179.globl	poly1305_blocks
180.ent	poly1305_blocks
181poly1305_blocks:
182	.set	noreorder
183	dsrl	$len,4			# number of complete blocks
184	bnez	$len,poly1305_blocks_internal
185	nop
186	jr	$ra
187	nop
188.end	poly1305_blocks
189
190.align	5
191.ent	poly1305_blocks_internal
192poly1305_blocks_internal:
193	.frame	$sp,6*8,$ra
194	.mask	$SAVED_REGS_MASK,-8
195	.set	noreorder
196	dsubu	$sp,6*8
197	sd	$s5,40($sp)
198	sd	$s4,32($sp)
199___
# $s0-$s3 are $12-$15 in the layout used here; they are saved only for
# NUBI, matching the wider $SAVED_REGS_MASK chosen for that flavour.
200$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
201	sd	$s3,24($sp)
202	sd	$s2,16($sp)
203	sd	$s1,8($sp)
204	sd	$s0,0($sp)
205___
206$code.=<<___;
207	.set	reorder
208
209	ld	$h0,0($ctx)		# load hash value
210	ld	$h1,8($ctx)
211	ld	$h2,16($ctx)
212
213	ld	$r0,24($ctx)		# load key
214	ld	$r1,32($ctx)
215	ld	$s1,40($ctx)
216
217.Loop:
218#if defined(_MIPS_ARCH_MIPS64R6)
219	ld	$in0,0($inp)		# load input
220	ld	$in1,8($inp)
221#else
222	ldl	$in0,0+MSB($inp)	# load input
223	ldl	$in1,8+MSB($inp)
224	ldr	$in0,0+LSB($inp)
225	ldr	$in1,8+LSB($inp)
226#endif
227	daddiu	$len,-1
228	daddiu	$inp,16
229#ifdef	MIPSEB
230# if defined(_MIPS_ARCH_MIPS64R2)
231	dsbh	$in0,$in0		# byte swap
232	 dsbh	$in1,$in1
233	dshd	$in0,$in0
234	 dshd	$in1,$in1
235# else
236	ori	$tmp0,$zero,0xFF
237	dsll	$tmp2,$tmp0,32
238	or	$tmp0,$tmp2		# 0x000000FF000000FF
239
240	and	$tmp1,$in0,$tmp0	# byte swap
241	 and	$tmp3,$in1,$tmp0
242	dsrl	$tmp2,$in0,24
243	 dsrl	$tmp4,$in1,24
244	dsll	$tmp1,24
245	 dsll	$tmp3,24
246	and	$tmp2,$tmp0
247	 and	$tmp4,$tmp0
248	dsll	$tmp0,8			# 0x0000FF000000FF00
249	or	$tmp1,$tmp2
250	 or	$tmp3,$tmp4
251	and	$tmp2,$in0,$tmp0
252	 and	$tmp4,$in1,$tmp0
253	dsrl	$in0,8
254	 dsrl	$in1,8
255	dsll	$tmp2,8
256	 dsll	$tmp4,8
257	and	$in0,$tmp0
258	 and	$in1,$tmp0
259	or	$tmp1,$tmp2
260	 or	$tmp3,$tmp4
261	or	$in0,$tmp1
262	 or	$in1,$tmp3
263	dsrl	$tmp1,$in0,32
264	 dsrl	$tmp3,$in1,32
265	dsll	$in0,32
266	 dsll	$in1,32
267	or	$in0,$tmp1
268	 or	$in1,$tmp3
269# endif
270#endif
271	daddu	$h0,$in0		# accumulate input
272	daddu	$h1,$in1
273	sltu	$tmp0,$h0,$in0
274	sltu	$tmp1,$h1,$in1
275	daddu	$h1,$tmp0
276
277	dmultu	($r0,$h0)		# h0*r0
278	 daddu	$h2,$padbit
279	 sltu	$tmp0,$h1,$tmp0
280	mflo	($d0,$r0,$h0)
281	mfhi	($d1,$r0,$h0)
282
283	dmultu	($s1,$h1)		# h1*5*r1
284	 daddu	$tmp0,$tmp1
285	 daddu	$h2,$tmp0
286	mflo	($tmp0,$s1,$h1)
287	mfhi	($tmp1,$s1,$h1)
288
289	dmultu	($r1,$h0)		# h0*r1
290	 daddu	$d0,$tmp0
291	 daddu	$d1,$tmp1
292	mflo	($tmp2,$r1,$h0)
293	mfhi	($d2,$r1,$h0)
294	 sltu	$tmp0,$d0,$tmp0
295	 daddu	$d1,$tmp0
296
297	dmultu	($r0,$h1)		# h1*r0
298	 daddu	$d1,$tmp2
299	 sltu	$tmp2,$d1,$tmp2
300	mflo	($tmp0,$r0,$h1)
301	mfhi	($tmp1,$r0,$h1)
302	 daddu	$d2,$tmp2
303
304	dmultu	($s1,$h2)		# h2*5*r1
305	 daddu	$d1,$tmp0
306	 daddu	$d2,$tmp1
307	mflo	($tmp2,$s1,$h2)
308
309	dmultu	($r0,$h2)		# h2*r0
310	 sltu	$tmp0,$d1,$tmp0
311	 daddu	$d2,$tmp0
312	mflo	($tmp3,$r0,$h2)
313
314	daddu	$d1,$tmp2
315	daddu	$d2,$tmp3
316	sltu	$tmp2,$d1,$tmp2
317	daddu	$d2,$tmp2
318
319	li	$tmp0,-4		# final reduction
320	and	$tmp0,$d2
321	dsrl	$tmp1,$d2,2
322	andi	$h2,$d2,3
323	daddu	$tmp0,$tmp1
324	daddu	$h0,$d0,$tmp0
325	sltu	$tmp0,$h0,$tmp0
326	daddu	$h1,$d1,$tmp0
327	sltu	$tmp0,$h1,$tmp0
328	daddu	$h2,$h2,$tmp0
329
330	bnez	$len,.Loop
331
332	sd	$h0,0($ctx)		# store hash value
333	sd	$h1,8($ctx)
334	sd	$h2,16($ctx)
335
336	.set	noreorder
337	ld	$s5,40($sp)		# epilogue
338	ld	$s4,32($sp)
339___
# Mirror of the prologue: $s0-$s3 are reloaded only where they were saved.
340$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
341	ld	$s3,24($sp)
342	ld	$s2,16($sp)
343	ld	$s1,8($sp)
344	ld	$s0,0($sp)
345___
346$code.=<<___;
347	jr	$ra
348	daddu	$sp,6*8
349.end	poly1305_blocks_internal
350___
351}
# poly1305_emit(ctx, mac, nonce): final modular reduction and tag output.
#
# - Loads h0,h1,h2 from ctx+0/8/16.
# - Computes h + 5 limb by limb into $in0:$in1, the carry being added to
#   the copy of h2; that sum is shifted right by 2 and negated to form a
#   select mask, with its complement in $tmp3.
#   NOTE(review): the mask is cleanly all-ones/all-zero only if the shifted
#   value is 0 or 1, i.e. h2 is small on entry - this appears to follow
#   from the reduction in poly1305_blocks; confirm.
# - Selects h + 5 (low 128 bits) where the mask is set and h otherwise.
# - Loads the nonce as four 32-bit words with lwu and pairs them into two
#   64-bit values (word0 | word1 << 32, word2 | word3 << 32).
#   NOTE(review): lwu presumably needs nonce to be 4-byte aligned - verify
#   against the callers.
# - Adds the nonce to the selected value, carrying from the low half into
#   the high half; any carry out of the high half is discarded.
# - Writes the 16 result bytes to mac one at a time with sb, least
#   significant byte of $in0 first, then $in1.
# - Finally emits the CRYPTOGAMS identification string into .rdata.
352{
353my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
354
355$code.=<<___;
356.align	5
357.globl	poly1305_emit
358.ent	poly1305_emit
359poly1305_emit:
360	.frame	$sp,0,$ra
361	.set	reorder
362
363	ld	$tmp0,0($ctx)
364	ld	$tmp1,8($ctx)
365	ld	$tmp2,16($ctx)
366
367	daddiu	$in0,$tmp0,5		# compare to modulus
368	sltiu	$tmp3,$in0,5
369	daddu	$in1,$tmp1,$tmp3
370	sltu	$tmp3,$in1,$tmp3
371	daddu	$tmp2,$tmp2,$tmp3
372
373	dsrl	$tmp2,2			# see if it carried/borrowed
374	dsubu	$tmp2,$zero,$tmp2
375	nor	$tmp3,$zero,$tmp2
376
377	and	$in0,$tmp2
378	and	$tmp0,$tmp3
379	and	$in1,$tmp2
380	and	$tmp1,$tmp3
381	or	$in0,$tmp0
382	or	$in1,$tmp1
383
384	lwu	$tmp0,0($nonce)		# load nonce
385	lwu	$tmp1,4($nonce)
386	lwu	$tmp2,8($nonce)
387	lwu	$tmp3,12($nonce)
388	dsll	$tmp1,32
389	dsll	$tmp3,32
390	or	$tmp0,$tmp1
391	or	$tmp2,$tmp3
392
393	daddu	$in0,$tmp0		# accumulate nonce
394	daddu	$in1,$tmp2
395	sltu	$tmp0,$in0,$tmp0
396	daddu	$in1,$tmp0
397
398	dsrl	$tmp0,$in0,8		# write mac value
399	dsrl	$tmp1,$in0,16
400	dsrl	$tmp2,$in0,24
401	sb	$in0,0($mac)
402	dsrl	$tmp3,$in0,32
403	sb	$tmp0,1($mac)
404	dsrl	$tmp0,$in0,40
405	sb	$tmp1,2($mac)
406	dsrl	$tmp1,$in0,48
407	sb	$tmp2,3($mac)
408	dsrl	$tmp2,$in0,56
409	sb	$tmp3,4($mac)
410	dsrl	$tmp3,$in1,8
411	sb	$tmp0,5($mac)
412	dsrl	$tmp0,$in1,16
413	sb	$tmp1,6($mac)
414	dsrl	$tmp1,$in1,24
415	sb	$tmp2,7($mac)
416
417	sb	$in1,8($mac)
418	dsrl	$tmp2,$in1,32
419	sb	$tmp3,9($mac)
420	dsrl	$tmp3,$in1,40
421	sb	$tmp0,10($mac)
422	dsrl	$tmp0,$in1,48
423	sb	$tmp1,11($mac)
424	dsrl	$tmp1,$in1,56
425	sb	$tmp2,12($mac)
426	sb	$tmp3,13($mac)
427	sb	$tmp0,14($mac)
428	sb	$tmp1,15($mac)
429
430	jr	$ra
431.end	poly1305_emit
432.rdata
433.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
434.align	2
435___
436}
437
# Write the generated assembly to $output when one was given on the command
# line, otherwise to the inherited STDOUT.
#
# The three-argument form of open keeps the file name out of the mode
# string, and the result is checked so that a bad path is reported with the
# real reason instead of surfacing later as an unrelated close failure.
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}
print $code;
# Buffered write errors only show up at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
441
442