1#! /usr/bin/env perl
2# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# A squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 the improvement
39# is still modest for the longest key lengths, while other processors
40# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
41# ~25% faster on Cortex-A57 and ~60-100% faster on others.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
45my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
51die "can't locate arm-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour \"$output\""
54    or die "can't call $xlate: $!";
55*STDOUT=*OUT;
56
57($lo0,$hi0,$aj,$m0,$alo,$ahi,
58 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
59 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
60
61# int bn_mul_mont(
62$rp="x0";	# BN_ULONG *rp,
63$ap="x1";	# const BN_ULONG *ap,
64$bp="x2";	# const BN_ULONG *bp,
65$np="x3";	# const BN_ULONG *np,
66$n0="x4";	# const BN_ULONG *n0,
67$num="x5";	# int num);
68
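#
# For reference, the scalar code below performs word-serial Montgomery
# multiplication along the lines of the following sketch (illustrative
# only; lo64()/hi64() denote the low/high 64-bit halves, tp[] starts out
# zero, and carry/overflow handling as well as scheduling are simplified):
#
#	for (i = 0; i < num; i++) {
#		m = (tp[0] + ap[0]*bp[i]) * n0;		# mod 2^64
#		c = 0;
#		for (j = 0; j < num; j++) {
#			t = tp[j] + ap[j]*bp[i] + np[j]*m + c;	# 128-bit
#			if (j) tp[j-1] = lo64(t);
#			c = hi64(t);
#		}
#		tp[num-1] = lo64(c);	# any remaining bit becomes the
#					# "upmost overflow" word
#	}
#
# followed by a single conditional subtraction of np[] from tp[].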
69$code.=<<___;
70#include "arm_arch.h"
71#ifndef	__KERNEL__
72.extern OPENSSL_armv8_rsa_neonized
73.hidden OPENSSL_armv8_rsa_neonized
74#endif
75.text
76
77.globl	bn_mul_mont
78.type	bn_mul_mont,%function
79.align	5
80bn_mul_mont:
81	AARCH64_SIGN_LINK_REGISTER
82.Lbn_mul_mont:
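	// Dispatch: num that is not a multiple of 4 falls through to the
	// generic loop at .Lmul_mont; num > 32 (and a multiple of 4) may be
	// handed to bn_mul8x_mont_neon when OPENSSL_armv8_rsa_neonized is
	// set; otherwise multiples of 8 go to __bn_sqr8x_mont and the
	// remaining multiples of 4 to __bn_mul4x_mont.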
83	tst	$num,#3
84	b.ne	.Lmul_mont
85	cmp	$num,#32
86	b.le	.Lscalar_impl
87#ifndef	__KERNEL__
88#ifndef	__AARCH64EB__
89	adrp	x17,OPENSSL_armv8_rsa_neonized
90	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
91	cbnz	w17, bn_mul8x_mont_neon
92#endif
93#endif
94
95.Lscalar_impl:
96	tst	$num,#7
97	b.eq	__bn_sqr8x_mont
98	tst	$num,#3
99	b.eq	__bn_mul4x_mont
100
101.Lmul_mont:
102	stp	x29,x30,[sp,#-64]!
103	add	x29,sp,#0
104	stp	x19,x20,[sp,#16]
105	stp	x21,x22,[sp,#32]
106	stp	x23,x24,[sp,#48]
107
108	ldr	$m0,[$bp],#8		// bp[0]
109	sub	$tp,sp,$num,lsl#3
110	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
111	lsl	$num,$num,#3
112	ldr	$n0,[$n0]		// *n0
113	and	$tp,$tp,#-16		// ABI says so
114	ldp	$hi1,$nj,[$np],#16	// np[0..1]
115
116	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
117	sub	$j,$num,#16		// j=num-2
118	umulh	$hi0,$hi0,$m0
119	mul	$alo,$aj,$m0		// ap[1]*bp[0]
120	umulh	$ahi,$aj,$m0
121
122	mul	$m1,$lo0,$n0		// "tp[0]"*n0
123	mov	sp,$tp			// alloca
124
125	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
126	umulh	$hi1,$hi1,$m1
127	mul	$nlo,$nj,$m1		// np[1]*m1
128	// (*)	adds	$lo1,$lo1,$lo0	// discarded
129	// (*)	On the removal of the first multiplication and addition
130	//	instructions: the outcome of that first addition is
131	//	guaranteed to be zero, which leaves only one computationally
132	//	significant outcome, namely whether or not it carries. So
133	//	when does it carry? Is there an alternative way to deduce
134	//	it? If you follow the operations, you can observe that the
135	//	condition for carry is quite simple: $lo0 being non-zero.
136	//	The carry can therefore be calculated by adding -1 to $lo0,
137	//	which is what the next instruction does.
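	//	For instance, with n0 == -1/np[0] mod 2^64 as Montgomery
	//	reduction requires, $m1 is chosen so that lo(np[0]*m1) equals
	//	(2^64 - $lo0) mod 2^64; the discarded addition therefore
	//	yields exactly zero and carries precisely when $lo0 != 0,
	//	which is the same condition that sets the carry flag in
	//	"subs xzr,$lo0,#1" below.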
138	subs	xzr,$lo0,#1		// (*)
139	umulh	$nhi,$nj,$m1
140	adc	$hi1,$hi1,xzr
141	cbz	$j,.L1st_skip
142
143.L1st:
144	ldr	$aj,[$ap],#8
145	adds	$lo0,$alo,$hi0
146	sub	$j,$j,#8		// j--
147	adc	$hi0,$ahi,xzr
148
149	ldr	$nj,[$np],#8
150	adds	$lo1,$nlo,$hi1
151	mul	$alo,$aj,$m0		// ap[j]*bp[0]
152	adc	$hi1,$nhi,xzr
153	umulh	$ahi,$aj,$m0
154
155	adds	$lo1,$lo1,$lo0
156	mul	$nlo,$nj,$m1		// np[j]*m1
157	adc	$hi1,$hi1,xzr
158	umulh	$nhi,$nj,$m1
159	str	$lo1,[$tp],#8		// tp[j-1]
160	cbnz	$j,.L1st
161
162.L1st_skip:
163	adds	$lo0,$alo,$hi0
164	sub	$ap,$ap,$num		// rewind $ap
165	adc	$hi0,$ahi,xzr
166
167	adds	$lo1,$nlo,$hi1
168	sub	$np,$np,$num		// rewind $np
169	adc	$hi1,$nhi,xzr
170
171	adds	$lo1,$lo1,$lo0
172	sub	$i,$num,#8		// i=num-1
173	adcs	$hi1,$hi1,$hi0
174
175	adc	$ovf,xzr,xzr		// upmost overflow bit
176	stp	$lo1,$hi1,[$tp]
177
178.Louter:
179	ldr	$m0,[$bp],#8		// bp[i]
180	ldp	$hi0,$aj,[$ap],#16
181	ldr	$tj,[sp]		// tp[0]
182	add	$tp,sp,#8
183
184	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
185	sub	$j,$num,#16		// j=num-2
186	umulh	$hi0,$hi0,$m0
187	ldp	$hi1,$nj,[$np],#16
188	mul	$alo,$aj,$m0		// ap[1]*bp[i]
189	adds	$lo0,$lo0,$tj
190	umulh	$ahi,$aj,$m0
191	adc	$hi0,$hi0,xzr
192
193	mul	$m1,$lo0,$n0
194	sub	$i,$i,#8		// i--
195
196	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
197	umulh	$hi1,$hi1,$m1
198	mul	$nlo,$nj,$m1		// np[1]*m1
199	// (*)	adds	$lo1,$lo1,$lo0
200	subs	xzr,$lo0,#1		// (*)
201	umulh	$nhi,$nj,$m1
202	cbz	$j,.Linner_skip
203
204.Linner:
205	ldr	$aj,[$ap],#8
206	adc	$hi1,$hi1,xzr
207	ldr	$tj,[$tp],#8		// tp[j]
208	adds	$lo0,$alo,$hi0
209	sub	$j,$j,#8		// j--
210	adc	$hi0,$ahi,xzr
211
212	adds	$lo1,$nlo,$hi1
213	ldr	$nj,[$np],#8
214	adc	$hi1,$nhi,xzr
215
216	mul	$alo,$aj,$m0		// ap[j]*bp[i]
217	adds	$lo0,$lo0,$tj
218	umulh	$ahi,$aj,$m0
219	adc	$hi0,$hi0,xzr
220
221	mul	$nlo,$nj,$m1		// np[j]*m1
222	adds	$lo1,$lo1,$lo0
223	umulh	$nhi,$nj,$m1
224	stur	$lo1,[$tp,#-16]		// tp[j-1]
225	cbnz	$j,.Linner
226
227.Linner_skip:
228	ldr	$tj,[$tp],#8		// tp[j]
229	adc	$hi1,$hi1,xzr
230	adds	$lo0,$alo,$hi0
231	sub	$ap,$ap,$num		// rewind $ap
232	adc	$hi0,$ahi,xzr
233
234	adds	$lo1,$nlo,$hi1
235	sub	$np,$np,$num		// rewind $np
236	adcs	$hi1,$nhi,$ovf
237	adc	$ovf,xzr,xzr
238
239	adds	$lo0,$lo0,$tj
240	adc	$hi0,$hi0,xzr
241
242	adds	$lo1,$lo1,$lo0
243	adcs	$hi1,$hi1,$hi0
244	adc	$ovf,$ovf,xzr		// upmost overflow bit
245	stp	$lo1,$hi1,[$tp,#-16]
246
247	cbnz	$i,.Louter
248
249	// Final step. We check whether the result is larger than the
250	// modulus and, if it is, subtract the modulus. But comparison
251	// implies subtraction, so we subtract the modulus, check whether
252	// it borrowed, and conditionally copy the original value.
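	// The copy itself is done with csel below rather than a branch: if
	// the subtraction borrowed ("lo"), the original tp[] word is kept,
	// otherwise the difference already stored in rp[] is kept; tp[] is
	// wiped in either case.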
253	ldr	$tj,[sp]		// tp[0]
254	add	$tp,sp,#8
255	ldr	$nj,[$np],#8		// np[0]
256	subs	$j,$num,#8		// j=num-1 and clear borrow
257	mov	$ap,$rp
258.Lsub:
259	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
260	ldr	$tj,[$tp],#8
261	sub	$j,$j,#8		// j--
262	ldr	$nj,[$np],#8
263	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
264	cbnz	$j,.Lsub
265
266	sbcs	$aj,$tj,$nj
267	sbcs	$ovf,$ovf,xzr		// did it borrow?
268	str	$aj,[$ap],#8		// rp[num-1]
269
270	ldr	$tj,[sp]		// tp[0]
271	add	$tp,sp,#8
272	ldr	$aj,[$rp],#8		// rp[0]
273	sub	$num,$num,#8		// num--
274	nop
275.Lcond_copy:
276	sub	$num,$num,#8		// num--
277	csel	$nj,$tj,$aj,lo		// did it borrow?
278	ldr	$tj,[$tp],#8
279	ldr	$aj,[$rp],#8
280	stur	xzr,[$tp,#-16]		// wipe tp
281	stur	$nj,[$rp,#-16]
282	cbnz	$num,.Lcond_copy
283
284	csel	$nj,$tj,$aj,lo
285	stur	xzr,[$tp,#-8]		// wipe tp
286	stur	$nj,[$rp,#-8]
287
288	ldp	x19,x20,[x29,#16]
289	mov	sp,x29
290	ldp	x21,x22,[x29,#32]
291	mov	x0,#1
292	ldp	x23,x24,[x29,#48]
293	ldr	x29,[sp],#64
294	AARCH64_VALIDATE_LINK_REGISTER
295	ret
296.size	bn_mul_mont,.-bn_mul_mont
297___
298{
299my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
300my ($Z,$Temp)=("v4.16b","v5");
301my @ACC=map("v$_",(6..13));
302my ($Bi,$Ni,$M0)=map("v$_",(28..30));
303my $sBi="s28";
304my $sM0="s30";
305my $zero="v14";
306my $temp="v15";
307my $ACCTemp="v16";
308
309my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
310my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
311
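# The NEON path works on 32-bit a[]/n[] words and 16-bit b[] digits (the
# "smashed" values put aside below): each 32x16-bit partial product fits in
# 48 bits, so the 64-bit vector lanes can absorb many umlal accumulations
# before carries need to be propagated; the final propagation into 32-bit
# result words happens in the .LNEON_tail loop.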
312$code.=<<___;
313.type	bn_mul8x_mont_neon,%function
314.align	5
315bn_mul8x_mont_neon:
316	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
317	// only from bn_mul_mont which has already signed the return address.
318	stp	x29,x30,[sp,#-80]!
319	mov	x16,sp
320	stp	d8,d9,[sp,#16]
321	stp	d10,d11,[sp,#32]
322	stp	d12,d13,[sp,#48]
323	stp	d14,d15,[sp,#64]
324	lsl	$num,$num,#1
325	eor	$zero.16b,$zero.16b,$zero.16b
326
327.align	4
328.LNEON_8n:
329	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
330	sub	$toutptr,sp,#128
331	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
332	sub	$toutptr,$toutptr,$num,lsl#4
333	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
334	and	$toutptr,$toutptr,#-64
335	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
336	mov	sp,$toutptr		// alloca
337	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
338	add	$toutptr,$toutptr,#256
339	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
340	sub	$inner,$num,#8
341	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
342	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b
343
344.LNEON_8n_init:
345	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
346	subs	$inner,$inner,#8
347	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
348	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
349	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
350	bne	.LNEON_8n_init
351
352	add	$tinptr,sp,#256
353	ld1	{$A0.4s,$A1.4s},[$aptr],#32
354	add	$bnptr,sp,#8
355	ldr	$sM0,[$n0],#4
356	mov	$outer,$num
357	b	.LNEON_8n_outer
358
359.align	4
360.LNEON_8n_outer:
361	ldr	$sBi,[$bptr],#4   // *b++
362	uxtl	$Bi.4s,$Bi.4h
363	add	$toutptr,sp,#128
364	ld1	{$N0.4s,$N1.4s},[$nptr],#32
365
366	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
367	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
368	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
369	shl	$Ni.2d,@ACC[0].2d,#16
370	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
371	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
372	add	$Ni.2d,$Ni.2d,@ACC[0].2d
373	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
374	mul	$Ni.2s,$Ni.2s,$M0.2s
375	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
376	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
377	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
378	uxtl	$Ni.4s,$Ni.4h
379	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
380___
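# The push(@ACC,shift(@ACC)) calls below rotate the accumulator register
# names by one position, so each unrolled step of the generated code
# addresses the accumulators under their shifted names; this stands in for
# the index rotation a runtime loop would otherwise perform.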
381for ($i=0; $i<7;) {
382$code.=<<___;
383	ldr	$sBi,[$bptr],#4   // *b++
384	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
385	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
386	uxtl	$Bi.4s,$Bi.4h
387	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
388	ushr	$temp.2d,@ACC[0].2d,#16
389	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
390	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
391	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
392	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
393	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
394	ushr	@ACC[0].2d,@ACC[0].2d,#16
395	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
396	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
397	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
398	ins	@ACC[1].d[0],$ACCTemp.d[0]
399	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
400___
401	push(@ACC,shift(@ACC));	$i++;
402$code.=<<___;
403	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
404	ld1	{@ACC[7].2d},[$tinptr],#16
405	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
406	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
407	shl	$Ni.2d,@ACC[0].2d,#16
408	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
409	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
410	add	$Ni.2d,$Ni.2d,@ACC[0].2d
411	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
412	mul	$Ni.2s,$Ni.2s,$M0.2s
413	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
414	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
415	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
416	uxtl	$Ni.4s,$Ni.4h
417	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
418___
419}
420$code.=<<___;
421	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
422	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
423	ld1	{$A0.4s,$A1.4s},[$aptr],#32
424	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
425	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
426	mov	$Temp.16b,@ACC[0].16b
427	ushr	$Temp.2d,$Temp.2d,#16
428	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
429	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
430	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
431	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
432	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
433	ushr	@ACC[0].2d,@ACC[0].2d,#16
434	eor	$temp.16b,$temp.16b,$temp.16b
435	ins	@ACC[0].d[1],$temp.d[0]
436	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
437	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
438	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
439	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
440	add	$bnptr,sp,#8		// rewind
441___
442	push(@ACC,shift(@ACC));
443$code.=<<___;
444	sub	$inner,$num,#8
445	b	.LNEON_8n_inner
446
447.align	4
448.LNEON_8n_inner:
449	subs	$inner,$inner,#8
450	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
451	ld1	{@ACC[7].2d},[$tinptr]
452	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
453	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
454	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
455	ld1	{$N0.4s,$N1.4s},[$nptr],#32
456	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
457	b.eq	.LInner_jump
458	add	$tinptr,$tinptr,#16	// don't advance in last iteration
459.LInner_jump:
460	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
461	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
462	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
463	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
464___
465for ($i=1; $i<8; $i++) {
466$code.=<<___;
467	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
468	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
469	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
470	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
471	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
472	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
473	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
474	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
475	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
476	st1	{@ACC[0].2d},[$toutptr],#16
477___
478	push(@ACC,shift(@ACC));
479$code.=<<___;
480	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
481	ld1	{@ACC[7].2d},[$tinptr]
482	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
483	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
484	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
485	b.eq	.LInner_jump$i
486	add	$tinptr,$tinptr,#16	// don't advance in last iteration
487.LInner_jump$i:
488	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
489	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
490	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
491	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
492	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
493___
494}
495$code.=<<___;
496	b.ne	.LInner_after_rewind$i
497	sub	$aptr,$aptr,$num,lsl#2	// rewind
498.LInner_after_rewind$i:
499	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
500	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
501	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
502	ld1	{$A0.4s,$A1.4s},[$aptr],#32
503	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
504	add	$bnptr,sp,#8		// rewind
505	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
506	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
507	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
508	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
509	st1	{@ACC[0].2d},[$toutptr],#16
510	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
511
512	bne	.LNEON_8n_inner
513___
514	push(@ACC,shift(@ACC));
515$code.=<<___;
516	add	$tinptr,sp,#128
517	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
518	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
519	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
520	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
521	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
522	st1	{@ACC[6].2d},[$toutptr]
523
524	subs	$outer,$outer,#8
525	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
526	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
527	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
528	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
529
530	b.eq	.LInner_8n_jump_2steps
531	sub	$nptr,$nptr,$num,lsl#2	// rewind
532	b	.LNEON_8n_outer
533
534.LInner_8n_jump_2steps:
535	add	$toutptr,sp,#128
536	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
537	mov	$Temp.16b,@ACC[0].16b
538	ushr	$temp.2d,@ACC[0].2d,#16
539	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
540	st1	{$N0.2d,$N1.2d}, [sp],#32
541	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
542	st1	{$N0.2d,$N1.2d}, [sp],#32
543	ushr	$temp.2d,@ACC[0].2d,#16
544	st1	{$N0.2d,$N1.2d}, [sp],#32
545	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
546	ins	$temp.d[1],$zero.d[0]
547
548	mov	$inner,$num
549	b	.LNEON_tail_entry
550
551.align	4
552.LNEON_tail:
553	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
554	mov	$Temp.16b,@ACC[0].16b
555	ushr	$temp.2d,@ACC[0].2d,#16
556	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
557	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
558	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
559	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
560	ushr	$temp.2d,@ACC[0].2d,#16
561	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
562	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
563	ins	$temp.d[1],$zero.d[0]
564
565.LNEON_tail_entry:
566___
567for ($i=1; $i<8; $i++) {
568$code.=<<___;
569	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
570	st1	{@ACC[0].s}[0], [$toutptr],#4
571	ushr	$temp.2d,@ACC[1].2d,#16
572	mov	$Temp.16b,@ACC[1].16b
573	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
574	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
575	ushr	$temp.2d,@ACC[1].2d,#16
576	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
577	ins	$temp.d[1],$zero.d[0]
578___
579	push(@ACC,shift(@ACC));
580}
581	push(@ACC,shift(@ACC));
582$code.=<<___;
583	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
584	subs	$inner,$inner,#8
585	st1	{@ACC[7].s}[0], [$toutptr],#4
586	bne	.LNEON_tail
587
588	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
589	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
590	subs	$aptr,sp,#0			// clear carry flag
591	add	$bptr,sp,$num,lsl#2
592
593.LNEON_sub:
594	ldp	w4,w5,[$aptr],#8
595	ldp	w6,w7,[$aptr],#8
596	ldp	w8,w9,[$nptr],#8
597	ldp	w10,w11,[$nptr],#8
598	sbcs	w8,w4,w8
599	sbcs	w9,w5,w9
600	sbcs	w10,w6,w10
601	sbcs	w11,w7,w11
602	sub	x17,$bptr,$aptr
603	stp	w8,w9,[$rptr],#8
604	stp	w10,w11,[$rptr],#8
605	cbnz	x17,.LNEON_sub
606
607	ldr	w10, [$aptr]		// load top-most bit
608	mov	x11,sp
609	eor	v0.16b,v0.16b,v0.16b
610	sub	x11,$bptr,x11		// this is num*4
611	eor	v1.16b,v1.16b,v1.16b
612	mov	$aptr,sp
613	sub	$rptr,$rptr,x11		// rewind $rptr
614	mov	$nptr,$bptr		// second 3/4th of frame
615	sbcs	w10,w10,wzr		// result is carry flag
616
617.LNEON_copy_n_zap:
618	ldp	w4,w5,[$aptr],#8
619	ldp	w6,w7,[$aptr],#8
620	ldp	w8,w9,[$rptr],#8
621	ldp	w10,w11,[$rptr]
622	sub	$rptr,$rptr,#8
623	b.cs	.LCopy_1
624	mov	w8,w4
625	mov	w9,w5
626	mov	w10,w6
627	mov	w11,w7
628.LCopy_1:
629	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
630	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
631	ldp	w4,w5,[$aptr],#8
632	ldp	w6,w7,[$aptr],#8
633	stp	w8,w9,[$rptr],#8
634	stp	w10,w11,[$rptr],#8
635	sub	$aptr,$aptr,#32
636	ldp	w8,w9,[$rptr],#8
637	ldp	w10,w11,[$rptr]
638	sub	$rptr,$rptr,#8
639	b.cs	.LCopy_2
640	mov	w8, w4
641	mov	w9, w5
642	mov	w10, w6
643	mov	w11, w7
644.LCopy_2:
645	st1	{v0.2d,v1.2d}, [$aptr],#32		// wipe
646	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
647	sub	x17,$bptr,$aptr		// preserves carry
648	stp	w8,w9,[$rptr],#8
649	stp	w10,w11,[$rptr],#8
650	cbnz	x17,.LNEON_copy_n_zap
651
652	mov	sp,x16
653	ldp	d14,d15,[sp,#64]
654	ldp	d12,d13,[sp,#48]
655	ldp	d10,d11,[sp,#32]
656	ldp	d8,d9,[sp,#16]
657	ldr	x29,[sp],#80
658	AARCH64_VALIDATE_LINK_REGISTER
659	ret			// bx lr
660
661.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
662___
663}
664{
665########################################################################
666# The following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5 module.
667
668my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
669my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
670my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
671my ($cnt,$carry,$topmost)=("x27","x28","x30");
672my ($tp,$ap_end,$na0)=($bp,$np,$carry);
673
674$code.=<<___;
675.type	__bn_sqr8x_mont,%function
676.align	5
677__bn_sqr8x_mont:
678	cmp	$ap,$bp
679	b.ne	__bn_mul4x_mont
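	// the dedicated squaring code is used only when the two multiplicands
	// alias, i.e. for a genuine a*a; everything else is redirected to
	// __bn_mul4x_mont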
680.Lsqr8x_mont:
681	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
682	// only from bn_mul_mont which has already signed the return address.
683	stp	x29,x30,[sp,#-128]!
684	add	x29,sp,#0
685	stp	x19,x20,[sp,#16]
686	stp	x21,x22,[sp,#32]
687	stp	x23,x24,[sp,#48]
688	stp	x25,x26,[sp,#64]
689	stp	x27,x28,[sp,#80]
690	stp	$rp,$np,[sp,#96]	// offload rp and np
691
692	ldp	$a0,$a1,[$ap,#8*0]
693	ldp	$a2,$a3,[$ap,#8*2]
694	ldp	$a4,$a5,[$ap,#8*4]
695	ldp	$a6,$a7,[$ap,#8*6]
696
697	sub	$tp,sp,$num,lsl#4
698	lsl	$num,$num,#3
699	ldr	$n0,[$n0]		// *n0
700	mov	sp,$tp			// alloca
701	sub	$cnt,$num,#8*8
702	b	.Lsqr8x_zero_start
703
704.Lsqr8x_zero:
705	sub	$cnt,$cnt,#8*8
706	stp	xzr,xzr,[$tp,#8*0]
707	stp	xzr,xzr,[$tp,#8*2]
708	stp	xzr,xzr,[$tp,#8*4]
709	stp	xzr,xzr,[$tp,#8*6]
710.Lsqr8x_zero_start:
711	stp	xzr,xzr,[$tp,#8*8]
712	stp	xzr,xzr,[$tp,#8*10]
713	stp	xzr,xzr,[$tp,#8*12]
714	stp	xzr,xzr,[$tp,#8*14]
715	add	$tp,$tp,#8*16
716	cbnz	$cnt,.Lsqr8x_zero
717
718	add	$ap_end,$ap,$num
719	add	$ap,$ap,#8*8
720	mov	$acc0,xzr
721	mov	$acc1,xzr
722	mov	$acc2,xzr
723	mov	$acc3,xzr
724	mov	$acc4,xzr
725	mov	$acc5,xzr
726	mov	$acc6,xzr
727	mov	$acc7,xzr
728	mov	$tp,sp
729	str	$n0,[x29,#112]		// offload n0
730
731	// Multiply everything but a[i]*a[i]
732.align	4
733.Lsqr8x_outer_loop:
734        //                                                 a[1]a[0]	(i)
735        //                                             a[2]a[0]
736        //                                         a[3]a[0]
737        //                                     a[4]a[0]
738        //                                 a[5]a[0]
739        //                             a[6]a[0]
740        //                         a[7]a[0]
741        //                                         a[2]a[1]		(ii)
742        //                                     a[3]a[1]
743        //                                 a[4]a[1]
744        //                             a[5]a[1]
745        //                         a[6]a[1]
746        //                     a[7]a[1]
747        //                                 a[3]a[2]			(iii)
748        //                             a[4]a[2]
749        //                         a[5]a[2]
750        //                     a[6]a[2]
751        //                 a[7]a[2]
752        //                         a[4]a[3]				(iv)
753        //                     a[5]a[3]
754        //                 a[6]a[3]
755        //             a[7]a[3]
756        //                 a[5]a[4]					(v)
757        //             a[6]a[4]
758        //         a[7]a[4]
759        //         a[6]a[5]						(vi)
760        //     a[7]a[5]
761        // a[7]a[6]							(vii)
762
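	// The triangle above accumulates each cross product a[i]*a[j] (i<j)
	// exactly once; .Lsqr8x_outer_break later doubles the whole thing and
	// adds the diagonal a[i]*a[i] terms, following
	//	a^2 = sum(a[i]^2 * 2^(128*i)) + 2*sum(a[i]*a[j] * 2^(64*(i+j))), i<j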
763	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
764	mul	$t1,$a2,$a0
765	mul	$t2,$a3,$a0
766	mul	$t3,$a4,$a0
767	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
768	mul	$t0,$a5,$a0
769	adcs	$acc2,$acc2,$t1
770	mul	$t1,$a6,$a0
771	adcs	$acc3,$acc3,$t2
772	mul	$t2,$a7,$a0
773	adcs	$acc4,$acc4,$t3
774	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
775	adcs	$acc5,$acc5,$t0
776	umulh	$t0,$a2,$a0
777	adcs	$acc6,$acc6,$t1
778	umulh	$t1,$a3,$a0
779	adcs	$acc7,$acc7,$t2
780	umulh	$t2,$a4,$a0
781	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
782	adc	$acc0,xzr,xzr		// t[8]
783	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
784	umulh	$t3,$a5,$a0
785	adcs	$acc3,$acc3,$t0
786	umulh	$t0,$a6,$a0
787	adcs	$acc4,$acc4,$t1
788	umulh	$t1,$a7,$a0
789	adcs	$acc5,$acc5,$t2
790	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
791	adcs	$acc6,$acc6,$t3
792	 mul	$t3,$a3,$a1
793	adcs	$acc7,$acc7,$t0
794	 mul	$t0,$a4,$a1
795	adc	$acc0,$acc0,$t1
796
797	mul	$t1,$a5,$a1
798	adds	$acc3,$acc3,$t2
799	mul	$t2,$a6,$a1
800	adcs	$acc4,$acc4,$t3
801	mul	$t3,$a7,$a1
802	adcs	$acc5,$acc5,$t0
803	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
804	adcs	$acc6,$acc6,$t1
805	umulh	$t1,$a3,$a1
806	adcs	$acc7,$acc7,$t2
807	umulh	$t2,$a4,$a1
808	adcs	$acc0,$acc0,$t3
809	umulh	$t3,$a5,$a1
810	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
811	adc	$acc1,xzr,xzr		// t[9]
812	adds	$acc4,$acc4,$t0
813	umulh	$t0,$a6,$a1
814	adcs	$acc5,$acc5,$t1
815	umulh	$t1,$a7,$a1
816	adcs	$acc6,$acc6,$t2
817	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
818	adcs	$acc7,$acc7,$t3
819	 mul	$t3,$a4,$a2
820	adcs	$acc0,$acc0,$t0
821	 mul	$t0,$a5,$a2
822	adc	$acc1,$acc1,$t1
823
824	mul	$t1,$a6,$a2
825	adds	$acc5,$acc5,$t2
826	mul	$t2,$a7,$a2
827	adcs	$acc6,$acc6,$t3
828	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
829	adcs	$acc7,$acc7,$t0
830	umulh	$t0,$a4,$a2
831	adcs	$acc0,$acc0,$t1
832	umulh	$t1,$a5,$a2
833	adcs	$acc1,$acc1,$t2
834	umulh	$t2,$a6,$a2
835	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
836	adc	$acc2,xzr,xzr		// t[10]
837	adds	$acc6,$acc6,$t3
838	umulh	$t3,$a7,$a2
839	adcs	$acc7,$acc7,$t0
840	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
841	adcs	$acc0,$acc0,$t1
842	 mul	$t1,$a5,$a3
843	adcs	$acc1,$acc1,$t2
844	 mul	$t2,$a6,$a3
845	adc	$acc2,$acc2,$t3
846
847	mul	$t3,$a7,$a3
848	adds	$acc7,$acc7,$t0
849	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
850	adcs	$acc0,$acc0,$t1
851	umulh	$t1,$a5,$a3
852	adcs	$acc1,$acc1,$t2
853	umulh	$t2,$a6,$a3
854	adcs	$acc2,$acc2,$t3
855	umulh	$t3,$a7,$a3
856	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
857	adc	$acc3,xzr,xzr		// t[11]
858	adds	$acc0,$acc0,$t0
859	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
860	adcs	$acc1,$acc1,$t1
861	 mul	$t1,$a6,$a4
862	adcs	$acc2,$acc2,$t2
863	 mul	$t2,$a7,$a4
864	adc	$acc3,$acc3,$t3
865
866	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
867	adds	$acc1,$acc1,$t0
868	umulh	$t0,$a6,$a4
869	adcs	$acc2,$acc2,$t1
870	umulh	$t1,$a7,$a4
871	adcs	$acc3,$acc3,$t2
872	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
873	adc	$acc4,xzr,xzr		// t[12]
874	adds	$acc2,$acc2,$t3
875	 mul	$t3,$a7,$a5
876	adcs	$acc3,$acc3,$t0
877	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
878	adc	$acc4,$acc4,$t1
879
880	umulh	$t1,$a7,$a5
881	adds	$acc3,$acc3,$t2
882	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
883	adcs	$acc4,$acc4,$t3
884	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
885	adc	$acc5,xzr,xzr		// t[13]
886	adds	$acc4,$acc4,$t0
887	sub	$cnt,$ap_end,$ap	// done yet?
888	adc	$acc5,$acc5,$t1
889
890	adds	$acc5,$acc5,$t2
891	sub	$t0,$ap_end,$num	// rewound ap
892	adc	$acc6,xzr,xzr		// t[14]
893	add	$acc6,$acc6,$t3
894
895	cbz	$cnt,.Lsqr8x_outer_break
896
897	mov	$n0,$a0
898	ldp	$a0,$a1,[$tp,#8*0]
899	ldp	$a2,$a3,[$tp,#8*2]
900	ldp	$a4,$a5,[$tp,#8*4]
901	ldp	$a6,$a7,[$tp,#8*6]
902	adds	$acc0,$acc0,$a0
903	adcs	$acc1,$acc1,$a1
904	ldp	$a0,$a1,[$ap,#8*0]
905	adcs	$acc2,$acc2,$a2
906	adcs	$acc3,$acc3,$a3
907	ldp	$a2,$a3,[$ap,#8*2]
908	adcs	$acc4,$acc4,$a4
909	adcs	$acc5,$acc5,$a5
910	ldp	$a4,$a5,[$ap,#8*4]
911	adcs	$acc6,$acc6,$a6
912	mov	$rp,$ap
913	adcs	$acc7,xzr,$a7
914	ldp	$a6,$a7,[$ap,#8*6]
915	add	$ap,$ap,#8*8
916	//adc	$carry,xzr,xzr		// moved below
917	mov	$cnt,#-8*8
918
919	//                                                         a[8]a[0]
920	//                                                     a[9]a[0]
921	//                                                 a[a]a[0]
922	//                                             a[b]a[0]
923	//                                         a[c]a[0]
924	//                                     a[d]a[0]
925	//                                 a[e]a[0]
926	//                             a[f]a[0]
927	//                                                     a[8]a[1]
928	//                         a[f]a[1]........................
929	//                                                 a[8]a[2]
930	//                     a[f]a[2]........................
931	//                                             a[8]a[3]
932	//                 a[f]a[3]........................
933	//                                         a[8]a[4]
934	//             a[f]a[4]........................
935	//                                     a[8]a[5]
936	//         a[f]a[5]........................
937	//                                 a[8]a[6]
938	//     a[f]a[6]........................
939	//                             a[8]a[7]
940	// a[f]a[7]........................
941.Lsqr8x_mul:
942	mul	$t0,$a0,$n0
943	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
944	mul	$t1,$a1,$n0
945	add	$cnt,$cnt,#8
946	mul	$t2,$a2,$n0
947	mul	$t3,$a3,$n0
948	adds	$acc0,$acc0,$t0
949	mul	$t0,$a4,$n0
950	adcs	$acc1,$acc1,$t1
951	mul	$t1,$a5,$n0
952	adcs	$acc2,$acc2,$t2
953	mul	$t2,$a6,$n0
954	adcs	$acc3,$acc3,$t3
955	mul	$t3,$a7,$n0
956	adcs	$acc4,$acc4,$t0
957	umulh	$t0,$a0,$n0
958	adcs	$acc5,$acc5,$t1
959	umulh	$t1,$a1,$n0
960	adcs	$acc6,$acc6,$t2
961	umulh	$t2,$a2,$n0
962	adcs	$acc7,$acc7,$t3
963	umulh	$t3,$a3,$n0
964	adc	$carry,$carry,xzr
965	str	$acc0,[$tp],#8
966	adds	$acc0,$acc1,$t0
967	umulh	$t0,$a4,$n0
968	adcs	$acc1,$acc2,$t1
969	umulh	$t1,$a5,$n0
970	adcs	$acc2,$acc3,$t2
971	umulh	$t2,$a6,$n0
972	adcs	$acc3,$acc4,$t3
973	umulh	$t3,$a7,$n0
974	ldr	$n0,[$rp,$cnt]
975	adcs	$acc4,$acc5,$t0
976	adcs	$acc5,$acc6,$t1
977	adcs	$acc6,$acc7,$t2
978	adcs	$acc7,$carry,$t3
979	//adc	$carry,xzr,xzr		// moved above
980	cbnz	$cnt,.Lsqr8x_mul
981					// note that carry flag is guaranteed
982					// to be zero at this point
983	cmp	$ap,$ap_end		// done yet?
984	b.eq	.Lsqr8x_break
985
986	ldp	$a0,$a1,[$tp,#8*0]
987	ldp	$a2,$a3,[$tp,#8*2]
988	ldp	$a4,$a5,[$tp,#8*4]
989	ldp	$a6,$a7,[$tp,#8*6]
990	adds	$acc0,$acc0,$a0
991	ldur	$n0,[$rp,#-8*8]
992	adcs	$acc1,$acc1,$a1
993	ldp	$a0,$a1,[$ap,#8*0]
994	adcs	$acc2,$acc2,$a2
995	adcs	$acc3,$acc3,$a3
996	ldp	$a2,$a3,[$ap,#8*2]
997	adcs	$acc4,$acc4,$a4
998	adcs	$acc5,$acc5,$a5
999	ldp	$a4,$a5,[$ap,#8*4]
1000	adcs	$acc6,$acc6,$a6
1001	mov	$cnt,#-8*8
1002	adcs	$acc7,$acc7,$a7
1003	ldp	$a6,$a7,[$ap,#8*6]
1004	add	$ap,$ap,#8*8
1005	//adc	$carry,xzr,xzr		// moved above
1006	b	.Lsqr8x_mul
1007
1008.align	4
1009.Lsqr8x_break:
1010	ldp	$a0,$a1,[$rp,#8*0]
1011	add	$ap,$rp,#8*8
1012	ldp	$a2,$a3,[$rp,#8*2]
1013	sub	$t0,$ap_end,$ap		// is it last iteration?
1014	ldp	$a4,$a5,[$rp,#8*4]
1015	sub	$t1,$tp,$t0
1016	ldp	$a6,$a7,[$rp,#8*6]
1017	cbz	$t0,.Lsqr8x_outer_loop
1018
1019	stp	$acc0,$acc1,[$tp,#8*0]
1020	ldp	$acc0,$acc1,[$t1,#8*0]
1021	stp	$acc2,$acc3,[$tp,#8*2]
1022	ldp	$acc2,$acc3,[$t1,#8*2]
1023	stp	$acc4,$acc5,[$tp,#8*4]
1024	ldp	$acc4,$acc5,[$t1,#8*4]
1025	stp	$acc6,$acc7,[$tp,#8*6]
1026	mov	$tp,$t1
1027	ldp	$acc6,$acc7,[$t1,#8*6]
1028	b	.Lsqr8x_outer_loop
1029
1030.align	4
1031.Lsqr8x_outer_break:
1032	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1033	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
1034	ldp	$t1,$t2,[sp,#8*1]
1035	ldp	$a5,$a7,[$t0,#8*2]
1036	add	$ap,$t0,#8*4
1037	ldp	$t3,$t0,[sp,#8*3]
1038
1039	stp	$acc0,$acc1,[$tp,#8*0]
1040	mul	$acc0,$a1,$a1
1041	stp	$acc2,$acc3,[$tp,#8*2]
1042	umulh	$a1,$a1,$a1
1043	stp	$acc4,$acc5,[$tp,#8*4]
1044	mul	$a2,$a3,$a3
1045	stp	$acc6,$acc7,[$tp,#8*6]
1046	mov	$tp,sp
1047	umulh	$a3,$a3,$a3
1048	adds	$acc1,$a1,$t1,lsl#1
1049	extr	$t1,$t2,$t1,#63
1050	sub	$cnt,$num,#8*4
1051
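	// Each extr of two adjacent words by #63 produces one word of the
	// doubled (left-shifted-by-one) cross-product sum, which is then
	// added to the freshly computed a[i]*a[i] squares on a single
	// carry chain.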
1052.Lsqr4x_shift_n_add:
1053	adcs	$acc2,$a2,$t1
1054	extr	$t2,$t3,$t2,#63
1055	sub	$cnt,$cnt,#8*4
1056	adcs	$acc3,$a3,$t2
1057	ldp	$t1,$t2,[$tp,#8*5]
1058	mul	$a4,$a5,$a5
1059	ldp	$a1,$a3,[$ap],#8*2
1060	umulh	$a5,$a5,$a5
1061	mul	$a6,$a7,$a7
1062	umulh	$a7,$a7,$a7
1063	extr	$t3,$t0,$t3,#63
1064	stp	$acc0,$acc1,[$tp,#8*0]
1065	adcs	$acc4,$a4,$t3
1066	extr	$t0,$t1,$t0,#63
1067	stp	$acc2,$acc3,[$tp,#8*2]
1068	adcs	$acc5,$a5,$t0
1069	ldp	$t3,$t0,[$tp,#8*7]
1070	extr	$t1,$t2,$t1,#63
1071	adcs	$acc6,$a6,$t1
1072	extr	$t2,$t3,$t2,#63
1073	adcs	$acc7,$a7,$t2
1074	ldp	$t1,$t2,[$tp,#8*9]
1075	mul	$a0,$a1,$a1
1076	ldp	$a5,$a7,[$ap],#8*2
1077	umulh	$a1,$a1,$a1
1078	mul	$a2,$a3,$a3
1079	umulh	$a3,$a3,$a3
1080	stp	$acc4,$acc5,[$tp,#8*4]
1081	extr	$t3,$t0,$t3,#63
1082	stp	$acc6,$acc7,[$tp,#8*6]
1083	add	$tp,$tp,#8*8
1084	adcs	$acc0,$a0,$t3
1085	extr	$t0,$t1,$t0,#63
1086	adcs	$acc1,$a1,$t0
1087	ldp	$t3,$t0,[$tp,#8*3]
1088	extr	$t1,$t2,$t1,#63
1089	cbnz	$cnt,.Lsqr4x_shift_n_add
1090___
1091my ($np,$np_end)=($ap,$ap_end);
1092$code.=<<___;
1093	 ldp	$np,$n0,[x29,#104]	// pull np and n0
1094
1095	adcs	$acc2,$a2,$t1
1096	extr	$t2,$t3,$t2,#63
1097	adcs	$acc3,$a3,$t2
1098	ldp	$t1,$t2,[$tp,#8*5]
1099	mul	$a4,$a5,$a5
1100	umulh	$a5,$a5,$a5
1101	stp	$acc0,$acc1,[$tp,#8*0]
1102	mul	$a6,$a7,$a7
1103	umulh	$a7,$a7,$a7
1104	stp	$acc2,$acc3,[$tp,#8*2]
1105	extr	$t3,$t0,$t3,#63
1106	adcs	$acc4,$a4,$t3
1107	extr	$t0,$t1,$t0,#63
1108	 ldp	$acc0,$acc1,[sp,#8*0]
1109	adcs	$acc5,$a5,$t0
1110	extr	$t1,$t2,$t1,#63
1111	 ldp	$a0,$a1,[$np,#8*0]
1112	adcs	$acc6,$a6,$t1
1113	extr	$t2,xzr,$t2,#63
1114	 ldp	$a2,$a3,[$np,#8*2]
1115	adc	$acc7,$a7,$t2
1116	 ldp	$a4,$a5,[$np,#8*4]
1117
1118	// Reduce by 512 bits per iteration
1119	mul	$na0,$n0,$acc0		// t[0]*n0
1120	ldp	$a6,$a7,[$np,#8*6]
1121	add	$np_end,$np,$num
1122	ldp	$acc2,$acc3,[sp,#8*2]
1123	stp	$acc4,$acc5,[$tp,#8*4]
1124	ldp	$acc4,$acc5,[sp,#8*4]
1125	stp	$acc6,$acc7,[$tp,#8*6]
1126	ldp	$acc6,$acc7,[sp,#8*6]
1127	add	$np,$np,#8*8
1128	mov	$topmost,xzr		// initial top-most carry
1129	mov	$tp,sp
1130	mov	$cnt,#8
1131
1132.Lsqr8x_reduction:
1133	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
1134	mul	$t1,$a1,$na0
1135	sub	$cnt,$cnt,#1
1136	mul	$t2,$a2,$na0
1137	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
1138	mul	$t3,$a3,$na0
1139	// (*)	adds	xzr,$acc0,$t0
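	// (*)	same trick as in bn_mul_mont: lo(n[0]*na0) equals
	//	(2^64 - $acc0) mod 2^64, so the discarded addition carries
	//	exactly when $acc0 != 0, which the "subs" below recovers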
1140	subs	xzr,$acc0,#1		// (*)
1141	mul	$t0,$a4,$na0
1142	adcs	$acc0,$acc1,$t1
1143	mul	$t1,$a5,$na0
1144	adcs	$acc1,$acc2,$t2
1145	mul	$t2,$a6,$na0
1146	adcs	$acc2,$acc3,$t3
1147	mul	$t3,$a7,$na0
1148	adcs	$acc3,$acc4,$t0
1149	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
1150	adcs	$acc4,$acc5,$t1
1151	umulh	$t1,$a1,$na0
1152	adcs	$acc5,$acc6,$t2
1153	umulh	$t2,$a2,$na0
1154	adcs	$acc6,$acc7,$t3
1155	umulh	$t3,$a3,$na0
1156	adc	$acc7,xzr,xzr
1157	adds	$acc0,$acc0,$t0
1158	umulh	$t0,$a4,$na0
1159	adcs	$acc1,$acc1,$t1
1160	umulh	$t1,$a5,$na0
1161	adcs	$acc2,$acc2,$t2
1162	umulh	$t2,$a6,$na0
1163	adcs	$acc3,$acc3,$t3
1164	umulh	$t3,$a7,$na0
1165	mul	$na0,$n0,$acc0		// next t[0]*n0
1166	adcs	$acc4,$acc4,$t0
1167	adcs	$acc5,$acc5,$t1
1168	adcs	$acc6,$acc6,$t2
1169	adc	$acc7,$acc7,$t3
1170	cbnz	$cnt,.Lsqr8x_reduction
1171
1172	ldp	$t0,$t1,[$tp,#8*0]
1173	ldp	$t2,$t3,[$tp,#8*2]
1174	mov	$rp,$tp
1175	sub	$cnt,$np_end,$np	// done yet?
1176	adds	$acc0,$acc0,$t0
1177	adcs	$acc1,$acc1,$t1
1178	ldp	$t0,$t1,[$tp,#8*4]
1179	adcs	$acc2,$acc2,$t2
1180	adcs	$acc3,$acc3,$t3
1181	ldp	$t2,$t3,[$tp,#8*6]
1182	adcs	$acc4,$acc4,$t0
1183	adcs	$acc5,$acc5,$t1
1184	adcs	$acc6,$acc6,$t2
1185	adcs	$acc7,$acc7,$t3
1186	//adc	$carry,xzr,xzr		// moved below
1187	cbz	$cnt,.Lsqr8x8_post_condition
1188
1189	ldur	$n0,[$tp,#-8*8]
1190	ldp	$a0,$a1,[$np,#8*0]
1191	ldp	$a2,$a3,[$np,#8*2]
1192	ldp	$a4,$a5,[$np,#8*4]
1193	mov	$cnt,#-8*8
1194	ldp	$a6,$a7,[$np,#8*6]
1195	add	$np,$np,#8*8
1196
1197.Lsqr8x_tail:
1198	mul	$t0,$a0,$n0
1199	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
1200	mul	$t1,$a1,$n0
1201	add	$cnt,$cnt,#8
1202	mul	$t2,$a2,$n0
1203	mul	$t3,$a3,$n0
1204	adds	$acc0,$acc0,$t0
1205	mul	$t0,$a4,$n0
1206	adcs	$acc1,$acc1,$t1
1207	mul	$t1,$a5,$n0
1208	adcs	$acc2,$acc2,$t2
1209	mul	$t2,$a6,$n0
1210	adcs	$acc3,$acc3,$t3
1211	mul	$t3,$a7,$n0
1212	adcs	$acc4,$acc4,$t0
1213	umulh	$t0,$a0,$n0
1214	adcs	$acc5,$acc5,$t1
1215	umulh	$t1,$a1,$n0
1216	adcs	$acc6,$acc6,$t2
1217	umulh	$t2,$a2,$n0
1218	adcs	$acc7,$acc7,$t3
1219	umulh	$t3,$a3,$n0
1220	adc	$carry,$carry,xzr
1221	str	$acc0,[$tp],#8
1222	adds	$acc0,$acc1,$t0
1223	umulh	$t0,$a4,$n0
1224	adcs	$acc1,$acc2,$t1
1225	umulh	$t1,$a5,$n0
1226	adcs	$acc2,$acc3,$t2
1227	umulh	$t2,$a6,$n0
1228	adcs	$acc3,$acc4,$t3
1229	umulh	$t3,$a7,$n0
1230	ldr	$n0,[$rp,$cnt]
1231	adcs	$acc4,$acc5,$t0
1232	adcs	$acc5,$acc6,$t1
1233	adcs	$acc6,$acc7,$t2
1234	adcs	$acc7,$carry,$t3
1235	//adc	$carry,xzr,xzr		// moved above
1236	cbnz	$cnt,.Lsqr8x_tail
1237					// note that carry flag is guaranteed
1238					// to be zero at this point
1239	ldp	$a0,$a1,[$tp,#8*0]
1240	sub	$cnt,$np_end,$np	// done yet?
1241	sub	$t2,$np_end,$num	// rewound np
1242	ldp	$a2,$a3,[$tp,#8*2]
1243	ldp	$a4,$a5,[$tp,#8*4]
1244	ldp	$a6,$a7,[$tp,#8*6]
1245	cbz	$cnt,.Lsqr8x_tail_break
1246
1247	ldur	$n0,[$rp,#-8*8]
1248	adds	$acc0,$acc0,$a0
1249	adcs	$acc1,$acc1,$a1
1250	ldp	$a0,$a1,[$np,#8*0]
1251	adcs	$acc2,$acc2,$a2
1252	adcs	$acc3,$acc3,$a3
1253	ldp	$a2,$a3,[$np,#8*2]
1254	adcs	$acc4,$acc4,$a4
1255	adcs	$acc5,$acc5,$a5
1256	ldp	$a4,$a5,[$np,#8*4]
1257	adcs	$acc6,$acc6,$a6
1258	mov	$cnt,#-8*8
1259	adcs	$acc7,$acc7,$a7
1260	ldp	$a6,$a7,[$np,#8*6]
1261	add	$np,$np,#8*8
1262	//adc	$carry,xzr,xzr		// moved above
1263	b	.Lsqr8x_tail
1264
1265.align	4
1266.Lsqr8x_tail_break:
1267	ldr	$n0,[x29,#112]		// pull n0
1268	add	$cnt,$tp,#8*8		// end of current t[num] window
1269
1270	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
1271	adcs	$t0,$acc0,$a0
1272	adcs	$t1,$acc1,$a1
1273	ldp	$acc0,$acc1,[$rp,#8*0]
1274	adcs	$acc2,$acc2,$a2
1275	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
1276	adcs	$acc3,$acc3,$a3
1277	ldp	$a2,$a3,[$t2,#8*2]
1278	adcs	$acc4,$acc4,$a4
1279	adcs	$acc5,$acc5,$a5
1280	ldp	$a4,$a5,[$t2,#8*4]
1281	adcs	$acc6,$acc6,$a6
1282	adcs	$acc7,$acc7,$a7
1283	ldp	$a6,$a7,[$t2,#8*6]
1284	add	$np,$t2,#8*8
1285	adc	$topmost,xzr,xzr	// top-most carry
1286	mul	$na0,$n0,$acc0
1287	stp	$t0,$t1,[$tp,#8*0]
1288	stp	$acc2,$acc3,[$tp,#8*2]
1289	ldp	$acc2,$acc3,[$rp,#8*2]
1290	stp	$acc4,$acc5,[$tp,#8*4]
1291	ldp	$acc4,$acc5,[$rp,#8*4]
1292	cmp	$cnt,x29		// did we hit the bottom?
1293	stp	$acc6,$acc7,[$tp,#8*6]
1294	mov	$tp,$rp			// slide the window
1295	ldp	$acc6,$acc7,[$rp,#8*6]
1296	mov	$cnt,#8
1297	b.ne	.Lsqr8x_reduction
1298
1299	// Final step. We check whether the result is larger than the
1300	// modulus and, if it is, subtract the modulus. But comparison
1301	// implies subtraction, so we subtract the modulus, check whether
1302	// it borrowed, and conditionally copy the original value.
1303	ldr	$rp,[x29,#96]		// pull rp
1304	add	$tp,$tp,#8*8
1305	subs	$t0,$acc0,$a0
1306	sbcs	$t1,$acc1,$a1
1307	sub	$cnt,$num,#8*8
1308	mov	$ap_end,$rp		// $rp copy
1309
1310.Lsqr8x_sub:
1311	sbcs	$t2,$acc2,$a2
1312	ldp	$a0,$a1,[$np,#8*0]
1313	sbcs	$t3,$acc3,$a3
1314	stp	$t0,$t1,[$rp,#8*0]
1315	sbcs	$t0,$acc4,$a4
1316	ldp	$a2,$a3,[$np,#8*2]
1317	sbcs	$t1,$acc5,$a5
1318	stp	$t2,$t3,[$rp,#8*2]
1319	sbcs	$t2,$acc6,$a6
1320	ldp	$a4,$a5,[$np,#8*4]
1321	sbcs	$t3,$acc7,$a7
1322	ldp	$a6,$a7,[$np,#8*6]
1323	add	$np,$np,#8*8
1324	ldp	$acc0,$acc1,[$tp,#8*0]
1325	sub	$cnt,$cnt,#8*8
1326	ldp	$acc2,$acc3,[$tp,#8*2]
1327	ldp	$acc4,$acc5,[$tp,#8*4]
1328	ldp	$acc6,$acc7,[$tp,#8*6]
1329	add	$tp,$tp,#8*8
1330	stp	$t0,$t1,[$rp,#8*4]
1331	sbcs	$t0,$acc0,$a0
1332	stp	$t2,$t3,[$rp,#8*6]
1333	add	$rp,$rp,#8*8
1334	sbcs	$t1,$acc1,$a1
1335	cbnz	$cnt,.Lsqr8x_sub
1336
1337	sbcs	$t2,$acc2,$a2
1338	 mov	$tp,sp
1339	 add	$ap,sp,$num
1340	 ldp	$a0,$a1,[$ap_end,#8*0]
1341	sbcs	$t3,$acc3,$a3
1342	stp	$t0,$t1,[$rp,#8*0]
1343	sbcs	$t0,$acc4,$a4
1344	 ldp	$a2,$a3,[$ap_end,#8*2]
1345	sbcs	$t1,$acc5,$a5
1346	stp	$t2,$t3,[$rp,#8*2]
1347	sbcs	$t2,$acc6,$a6
1348	 ldp	$acc0,$acc1,[$ap,#8*0]
1349	sbcs	$t3,$acc7,$a7
1350	 ldp	$acc2,$acc3,[$ap,#8*2]
1351	sbcs	xzr,$topmost,xzr	// did it borrow?
1352	ldr	x30,[x29,#8]		// pull return address
1353	stp	$t0,$t1,[$rp,#8*4]
1354	stp	$t2,$t3,[$rp,#8*6]
1355
1356	sub	$cnt,$num,#8*4
1357.Lsqr4x_cond_copy:
1358	sub	$cnt,$cnt,#8*4
1359	csel	$t0,$acc0,$a0,lo
1360	 stp	xzr,xzr,[$tp,#8*0]
1361	csel	$t1,$acc1,$a1,lo
1362	ldp	$a0,$a1,[$ap_end,#8*4]
1363	ldp	$acc0,$acc1,[$ap,#8*4]
1364	csel	$t2,$acc2,$a2,lo
1365	 stp	xzr,xzr,[$tp,#8*2]
1366	 add	$tp,$tp,#8*4
1367	csel	$t3,$acc3,$a3,lo
1368	ldp	$a2,$a3,[$ap_end,#8*6]
1369	ldp	$acc2,$acc3,[$ap,#8*6]
1370	add	$ap,$ap,#8*4
1371	stp	$t0,$t1,[$ap_end,#8*0]
1372	stp	$t2,$t3,[$ap_end,#8*2]
1373	add	$ap_end,$ap_end,#8*4
1374	 stp	xzr,xzr,[$ap,#8*0]
1375	 stp	xzr,xzr,[$ap,#8*2]
1376	cbnz	$cnt,.Lsqr4x_cond_copy
1377
1378	csel	$t0,$acc0,$a0,lo
1379	 stp	xzr,xzr,[$tp,#8*0]
1380	csel	$t1,$acc1,$a1,lo
1381	 stp	xzr,xzr,[$tp,#8*2]
1382	csel	$t2,$acc2,$a2,lo
1383	csel	$t3,$acc3,$a3,lo
1384	stp	$t0,$t1,[$ap_end,#8*0]
1385	stp	$t2,$t3,[$ap_end,#8*2]
1386
1387	b	.Lsqr8x_done
1388
1389.align	4
1390.Lsqr8x8_post_condition:
1391	adc	$carry,xzr,xzr
1392	ldr	x30,[x29,#8]		// pull return address
1393	// $acc0-7,$carry hold result, $a0-7 hold modulus
1394	subs	$a0,$acc0,$a0
1395	ldr	$ap,[x29,#96]		// pull rp
1396	sbcs	$a1,$acc1,$a1
1397	 stp	xzr,xzr,[sp,#8*0]
1398	sbcs	$a2,$acc2,$a2
1399	 stp	xzr,xzr,[sp,#8*2]
1400	sbcs	$a3,$acc3,$a3
1401	 stp	xzr,xzr,[sp,#8*4]
1402	sbcs	$a4,$acc4,$a4
1403	 stp	xzr,xzr,[sp,#8*6]
1404	sbcs	$a5,$acc5,$a5
1405	 stp	xzr,xzr,[sp,#8*8]
1406	sbcs	$a6,$acc6,$a6
1407	 stp	xzr,xzr,[sp,#8*10]
1408	sbcs	$a7,$acc7,$a7
1409	 stp	xzr,xzr,[sp,#8*12]
1410	sbcs	$carry,$carry,xzr	// did it borrow?
1411	 stp	xzr,xzr,[sp,#8*14]
1412
1413	// $a0-7 hold result-modulus
1414	csel	$a0,$acc0,$a0,lo
1415	csel	$a1,$acc1,$a1,lo
1416	csel	$a2,$acc2,$a2,lo
1417	csel	$a3,$acc3,$a3,lo
1418	stp	$a0,$a1,[$ap,#8*0]
1419	csel	$a4,$acc4,$a4,lo
1420	csel	$a5,$acc5,$a5,lo
1421	stp	$a2,$a3,[$ap,#8*2]
1422	csel	$a6,$acc6,$a6,lo
1423	csel	$a7,$acc7,$a7,lo
1424	stp	$a4,$a5,[$ap,#8*4]
1425	stp	$a6,$a7,[$ap,#8*6]
1426
1427.Lsqr8x_done:
1428	ldp	x19,x20,[x29,#16]
1429	mov	sp,x29
1430	ldp	x21,x22,[x29,#32]
1431	mov	x0,#1
1432	ldp	x23,x24,[x29,#48]
1433	ldp	x25,x26,[x29,#64]
1434	ldp	x27,x28,[x29,#80]
1435	ldr	x29,[sp],#128
1436	// x30 is loaded earlier
1437	AARCH64_VALIDATE_LINK_REGISTER
1438	ret
1439.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1440___
1441}
1442
1443{
1444########################################################################
1445# Even though this might look like an ARMv8 adaptation of mulx4x_mont from
1446# the x86_64-mont5 module, it is different in the sense that it performs
1447# reduction 256 bits at a time.
1448
1449my ($a0,$a1,$a2,$a3,
1450    $t0,$t1,$t2,$t3,
1451    $m0,$m1,$m2,$m3,
1452    $acc0,$acc1,$acc2,$acc3,$acc4,
1453    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1454my  $bp_end=$rp;
1455my  ($carry,$topmost) = ($rp,"x30");
1456
1457$code.=<<___;
1458.type	__bn_mul4x_mont,%function
1459.align	5
1460__bn_mul4x_mont:
1461	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1462	// only from bn_mul_mont (directly, or via __bn_sqr8x_mont), which has already signed the return address.
1463	stp	x29,x30,[sp,#-128]!
1464	add	x29,sp,#0
1465	stp	x19,x20,[sp,#16]
1466	stp	x21,x22,[sp,#32]
1467	stp	x23,x24,[sp,#48]
1468	stp	x25,x26,[sp,#64]
1469	stp	x27,x28,[sp,#80]
1470
1471	sub	$tp,sp,$num,lsl#3
1472	lsl	$num,$num,#3
1473	ldr	$n0,[$n0]		// *n0
1474	sub	sp,$tp,#8*4		// alloca
1475
1476	add	$t0,$bp,$num
1477	add	$ap_end,$ap,$num
1478	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
1479
1480	ldr	$bi,[$bp,#8*0]		// b[0]
1481	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1482	ldp	$a2,$a3,[$ap,#8*2]
1483	add	$ap,$ap,#8*4
1484	mov	$acc0,xzr
1485	mov	$acc1,xzr
1486	mov	$acc2,xzr
1487	mov	$acc3,xzr
1488	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1489	ldp	$m2,$m3,[$np,#8*2]
1490	adds	$np,$np,#8*4		// clear carry bit
1491	mov	$carry,xzr
1492	mov	$cnt,#0
1493	mov	$tp,sp
1494
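	// One pass of this loop handles 4 words of a[] and n[]; $cnt steps
	// through 8,16,24,0 (masked with #31) so that "ldr $bi,[$bp,$cnt]"
	// picks up the next b[i] within the current 4-word window, and the
	// loop exits once the index wraps back to 0.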
1495.Loop_mul4x_1st_reduction:
1496	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
1497	adc	$carry,$carry,xzr	// modulo-scheduled
1498	mul	$t1,$a1,$bi
1499	add	$cnt,$cnt,#8
1500	mul	$t2,$a2,$bi
1501	and	$cnt,$cnt,#31
1502	mul	$t3,$a3,$bi
1503	adds	$acc0,$acc0,$t0
1504	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
1505	adcs	$acc1,$acc1,$t1
1506	mul	$mi,$acc0,$n0		// t[0]*n0
1507	adcs	$acc2,$acc2,$t2
1508	umulh	$t1,$a1,$bi
1509	adcs	$acc3,$acc3,$t3
1510	umulh	$t2,$a2,$bi
1511	adc	$acc4,xzr,xzr
1512	umulh	$t3,$a3,$bi
1513	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1514	adds	$acc1,$acc1,$t0
1515	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
1516	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1517	adcs	$acc2,$acc2,$t1
1518	mul	$t1,$m1,$mi
1519	adcs	$acc3,$acc3,$t2
1520	mul	$t2,$m2,$mi
1521	adc	$acc4,$acc4,$t3		// can't overflow
1522	mul	$t3,$m3,$mi
1523	// (*)	adds	xzr,$acc0,$t0
1524	subs	xzr,$acc0,#1		// (*)
1525	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
1526	adcs	$acc0,$acc1,$t1
1527	umulh	$t1,$m1,$mi
1528	adcs	$acc1,$acc2,$t2
1529	umulh	$t2,$m2,$mi
1530	adcs	$acc2,$acc3,$t3
1531	umulh	$t3,$m3,$mi
1532	adcs	$acc3,$acc4,$carry
1533	adc	$carry,xzr,xzr
1534	adds	$acc0,$acc0,$t0
1535	sub	$t0,$ap_end,$ap
1536	adcs	$acc1,$acc1,$t1
1537	adcs	$acc2,$acc2,$t2
1538	adcs	$acc3,$acc3,$t3
1539	//adc	$carry,$carry,xzr
1540	cbnz	$cnt,.Loop_mul4x_1st_reduction
1541
1542	cbz	$t0,.Lmul4x4_post_condition
1543
1544	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1545	ldp	$a2,$a3,[$ap,#8*2]
1546	add	$ap,$ap,#8*4
1547	ldr	$mi,[sp]		// a[0]*n0
1548	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1549	ldp	$m2,$m3,[$np,#8*2]
1550	add	$np,$np,#8*4
1551
1552.Loop_mul4x_1st_tail:
1553	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
1554	adc	$carry,$carry,xzr	// modulo-scheduled
1555	mul	$t1,$a1,$bi
1556	add	$cnt,$cnt,#8
1557	mul	$t2,$a2,$bi
1558	and	$cnt,$cnt,#31
1559	mul	$t3,$a3,$bi
1560	adds	$acc0,$acc0,$t0
1561	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
1562	adcs	$acc1,$acc1,$t1
1563	umulh	$t1,$a1,$bi
1564	adcs	$acc2,$acc2,$t2
1565	umulh	$t2,$a2,$bi
1566	adcs	$acc3,$acc3,$t3
1567	umulh	$t3,$a3,$bi
1568	adc	$acc4,xzr,xzr
1569	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1570	adds	$acc1,$acc1,$t0
1571	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
1572	adcs	$acc2,$acc2,$t1
1573	mul	$t1,$m1,$mi
1574	adcs	$acc3,$acc3,$t2
1575	mul	$t2,$m2,$mi
1576	adc	$acc4,$acc4,$t3		// can't overflow
1577	mul	$t3,$m3,$mi
1578	adds	$acc0,$acc0,$t0
1579	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
1580	adcs	$acc1,$acc1,$t1
1581	umulh	$t1,$m1,$mi
1582	adcs	$acc2,$acc2,$t2
1583	umulh	$t2,$m2,$mi
1584	adcs	$acc3,$acc3,$t3
1585	adcs	$acc4,$acc4,$carry
1586	umulh	$t3,$m3,$mi
1587	adc	$carry,xzr,xzr
1588	ldr	$mi,[sp,$cnt]		// next t[0]*n0
1589	str	$acc0,[$tp],#8		// result!!!
1590	adds	$acc0,$acc1,$t0
1591	sub	$t0,$ap_end,$ap		// done yet?
1592	adcs	$acc1,$acc2,$t1
1593	adcs	$acc2,$acc3,$t2
1594	adcs	$acc3,$acc4,$t3
1595	//adc	$carry,$carry,xzr
1596	cbnz	$cnt,.Loop_mul4x_1st_tail
1597
1598	sub	$t1,$ap_end,$num	// rewound $ap
1599	cbz	$t0,.Lmul4x_proceed
1600
1601	ldp	$a0,$a1,[$ap,#8*0]
1602	ldp	$a2,$a3,[$ap,#8*2]
1603	add	$ap,$ap,#8*4
1604	ldp	$m0,$m1,[$np,#8*0]
1605	ldp	$m2,$m3,[$np,#8*2]
1606	add	$np,$np,#8*4
1607	b	.Loop_mul4x_1st_tail
1608
1609.align	5
1610.Lmul4x_proceed:
1611	ldr	$bi,[$bp,#8*4]!		// *++b
1612	adc	$topmost,$carry,xzr
1613	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
1614	sub	$np,$np,$num		// rewind np
1615	ldp	$a2,$a3,[$t1,#8*2]
1616	add	$ap,$t1,#8*4
1617
1618	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1619	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1620	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1621	ldp	$acc2,$acc3,[sp,#8*6]
1622
1623	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1624	mov	$tp,sp
1625	ldp	$m2,$m3,[$np,#8*2]
1626	adds	$np,$np,#8*4		// clear carry bit
1627	mov	$carry,xzr
1628
1629.align	4
1630.Loop_mul4x_reduction:
1631	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
1632	adc	$carry,$carry,xzr	// modulo-scheduled
1633	mul	$t1,$a1,$bi
1634	add	$cnt,$cnt,#8
1635	mul	$t2,$a2,$bi
1636	and	$cnt,$cnt,#31
1637	mul	$t3,$a3,$bi
1638	adds	$acc0,$acc0,$t0
1639	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
1640	adcs	$acc1,$acc1,$t1
1641	mul	$mi,$acc0,$n0		// t[0]*n0
1642	adcs	$acc2,$acc2,$t2
1643	umulh	$t1,$a1,$bi
1644	adcs	$acc3,$acc3,$t3
1645	umulh	$t2,$a2,$bi
1646	adc	$acc4,xzr,xzr
1647	umulh	$t3,$a3,$bi
1648	ldr	$bi,[$bp,$cnt]		// next b[i]
1649	adds	$acc1,$acc1,$t0
1650	// (*)	mul	$t0,$m0,$mi
1651	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1652	adcs	$acc2,$acc2,$t1
1653	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
1654	adcs	$acc3,$acc3,$t2
1655	mul	$t2,$m2,$mi
1656	adc	$acc4,$acc4,$t3		// can't overflow
1657	mul	$t3,$m3,$mi
1658	// (*)	adds	xzr,$acc0,$t0
1659	subs	xzr,$acc0,#1		// (*)
1660	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
1661	adcs	$acc0,$acc1,$t1
1662	umulh	$t1,$m1,$mi
1663	adcs	$acc1,$acc2,$t2
1664	umulh	$t2,$m2,$mi
1665	adcs	$acc2,$acc3,$t3
1666	umulh	$t3,$m3,$mi
1667	adcs	$acc3,$acc4,$carry
1668	adc	$carry,xzr,xzr
1669	adds	$acc0,$acc0,$t0
1670	adcs	$acc1,$acc1,$t1
1671	adcs	$acc2,$acc2,$t2
1672	adcs	$acc3,$acc3,$t3
1673	//adc	$carry,$carry,xzr
1674	cbnz	$cnt,.Loop_mul4x_reduction
1675
1676	adc	$carry,$carry,xzr
1677	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
1678	ldp	$t2,$t3,[$tp,#8*6]
1679	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1680	ldp	$a2,$a3,[$ap,#8*2]
1681	add	$ap,$ap,#8*4
1682	adds	$acc0,$acc0,$t0
1683	adcs	$acc1,$acc1,$t1
1684	adcs	$acc2,$acc2,$t2
1685	adcs	$acc3,$acc3,$t3
1686	//adc	$carry,$carry,xzr
1687
1688	ldr	$mi,[sp]		// t[0]*n0
1689	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1690	ldp	$m2,$m3,[$np,#8*2]
1691	add	$np,$np,#8*4
1692
1693.align	4
1694.Loop_mul4x_tail:
1695	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
1696	adc	$carry,$carry,xzr	// modulo-scheduled
1697	mul	$t1,$a1,$bi
1698	add	$cnt,$cnt,#8
1699	mul	$t2,$a2,$bi
1700	and	$cnt,$cnt,#31
1701	mul	$t3,$a3,$bi
1702	adds	$acc0,$acc0,$t0
1703	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
1704	adcs	$acc1,$acc1,$t1
1705	umulh	$t1,$a1,$bi
1706	adcs	$acc2,$acc2,$t2
1707	umulh	$t2,$a2,$bi
1708	adcs	$acc3,$acc3,$t3
1709	umulh	$t3,$a3,$bi
1710	adc	$acc4,xzr,xzr
1711	ldr	$bi,[$bp,$cnt]		// next b[i]
1712	adds	$acc1,$acc1,$t0
1713	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
1714	adcs	$acc2,$acc2,$t1
1715	mul	$t1,$m1,$mi
1716	adcs	$acc3,$acc3,$t2
1717	mul	$t2,$m2,$mi
1718	adc	$acc4,$acc4,$t3		// can't overflow
1719	mul	$t3,$m3,$mi
1720	adds	$acc0,$acc0,$t0
1721	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
1722	adcs	$acc1,$acc1,$t1
1723	umulh	$t1,$m1,$mi
1724	adcs	$acc2,$acc2,$t2
1725	umulh	$t2,$m2,$mi
1726	adcs	$acc3,$acc3,$t3
1727	umulh	$t3,$m3,$mi
1728	adcs	$acc4,$acc4,$carry
1729	ldr	$mi,[sp,$cnt]		// next a[0]*n0
1730	adc	$carry,xzr,xzr
1731	str	$acc0,[$tp],#8		// result!!!
1732	adds	$acc0,$acc1,$t0
1733	sub	$t0,$ap_end,$ap		// done yet?
1734	adcs	$acc1,$acc2,$t1
1735	adcs	$acc2,$acc3,$t2
1736	adcs	$acc3,$acc4,$t3
1737	//adc	$carry,$carry,xzr
1738	cbnz	$cnt,.Loop_mul4x_tail
1739
1740	sub	$t1,$np,$num		// rewound np?
1741	adc	$carry,$carry,xzr
1742	cbz	$t0,.Loop_mul4x_break
1743
1744	ldp	$t0,$t1,[$tp,#8*4]
1745	ldp	$t2,$t3,[$tp,#8*6]
1746	ldp	$a0,$a1,[$ap,#8*0]
1747	ldp	$a2,$a3,[$ap,#8*2]
1748	add	$ap,$ap,#8*4
1749	adds	$acc0,$acc0,$t0
1750	adcs	$acc1,$acc1,$t1
1751	adcs	$acc2,$acc2,$t2
1752	adcs	$acc3,$acc3,$t3
1753	//adc	$carry,$carry,xzr
1754	ldp	$m0,$m1,[$np,#8*0]
1755	ldp	$m2,$m3,[$np,#8*2]
1756	add	$np,$np,#8*4
1757	b	.Loop_mul4x_tail
1758
1759.align	4
1760.Loop_mul4x_break:
1761	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
1762	adds	$acc0,$acc0,$topmost
1763	add	$bp,$bp,#8*4		// bp++
1764	adcs	$acc1,$acc1,xzr
1765	sub	$ap,$ap,$num		// rewind ap
1766	adcs	$acc2,$acc2,xzr
1767	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1768	adcs	$acc3,$acc3,xzr
1769	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1770	adc	$topmost,$carry,xzr
1771	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1772	cmp	$bp,$t3			// done yet?
1773	ldp	$acc2,$acc3,[sp,#8*6]
1774	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
1775	ldp	$m2,$m3,[$t1,#8*2]
1776	add	$np,$t1,#8*4
1777	b.eq	.Lmul4x_post
1778
1779	ldr	$bi,[$bp]
1780	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1781	ldp	$a2,$a3,[$ap,#8*2]
1782	adds	$ap,$ap,#8*4		// clear carry bit
1783	mov	$carry,xzr
1784	mov	$tp,sp
1785	b	.Loop_mul4x_reduction
1786
1787.align	4
1788.Lmul4x_post:
1789	// Final step. We check whether the result is larger than the
1790	// modulus and, if it is, subtract the modulus. But comparison
1791	// implies subtraction, so we subtract the modulus, check whether
1792	// it borrowed, and conditionally copy the original value.
1793	mov	$rp,$t2
1794	mov	$ap_end,$t2		// $rp copy
1795	subs	$t0,$acc0,$m0
1796	add	$tp,sp,#8*8
1797	sbcs	$t1,$acc1,$m1
1798	sub	$cnt,$num,#8*4
1799
1800.Lmul4x_sub:
1801	sbcs	$t2,$acc2,$m2
1802	ldp	$m0,$m1,[$np,#8*0]
1803	sub	$cnt,$cnt,#8*4
1804	ldp	$acc0,$acc1,[$tp,#8*0]
1805	sbcs	$t3,$acc3,$m3
1806	ldp	$m2,$m3,[$np,#8*2]
1807	add	$np,$np,#8*4
1808	ldp	$acc2,$acc3,[$tp,#8*2]
1809	add	$tp,$tp,#8*4
1810	stp	$t0,$t1,[$rp,#8*0]
1811	sbcs	$t0,$acc0,$m0
1812	stp	$t2,$t3,[$rp,#8*2]
1813	add	$rp,$rp,#8*4
1814	sbcs	$t1,$acc1,$m1
1815	cbnz	$cnt,.Lmul4x_sub
1816
1817	sbcs	$t2,$acc2,$m2
1818	 mov	$tp,sp
1819	 add	$ap,sp,#8*4
1820	 ldp	$a0,$a1,[$ap_end,#8*0]
1821	sbcs	$t3,$acc3,$m3
1822	stp	$t0,$t1,[$rp,#8*0]
1823	 ldp	$a2,$a3,[$ap_end,#8*2]
1824	stp	$t2,$t3,[$rp,#8*2]
1825	 ldp	$acc0,$acc1,[$ap,#8*0]
1826	 ldp	$acc2,$acc3,[$ap,#8*2]
1827	sbcs	xzr,$topmost,xzr	// did it borrow?
1828	ldr	x30,[x29,#8]		// pull return address
1829
1830	sub	$cnt,$num,#8*4
1831.Lmul4x_cond_copy:
1832	sub	$cnt,$cnt,#8*4
1833	csel	$t0,$acc0,$a0,lo
1834	 stp	xzr,xzr,[$tp,#8*0]
1835	csel	$t1,$acc1,$a1,lo
1836	ldp	$a0,$a1,[$ap_end,#8*4]
1837	ldp	$acc0,$acc1,[$ap,#8*4]
1838	csel	$t2,$acc2,$a2,lo
1839	 stp	xzr,xzr,[$tp,#8*2]
1840	 add	$tp,$tp,#8*4
1841	csel	$t3,$acc3,$a3,lo
1842	ldp	$a2,$a3,[$ap_end,#8*6]
1843	ldp	$acc2,$acc3,[$ap,#8*6]
1844	add	$ap,$ap,#8*4
1845	stp	$t0,$t1,[$ap_end,#8*0]
1846	stp	$t2,$t3,[$ap_end,#8*2]
1847	add	$ap_end,$ap_end,#8*4
1848	cbnz	$cnt,.Lmul4x_cond_copy
1849
1850	csel	$t0,$acc0,$a0,lo
1851	 stp	xzr,xzr,[$tp,#8*0]
1852	csel	$t1,$acc1,$a1,lo
1853	 stp	xzr,xzr,[$tp,#8*2]
1854	csel	$t2,$acc2,$a2,lo
1855	 stp	xzr,xzr,[$tp,#8*3]
1856	csel	$t3,$acc3,$a3,lo
1857	 stp	xzr,xzr,[$tp,#8*4]
1858	stp	$t0,$t1,[$ap_end,#8*0]
1859	stp	$t2,$t3,[$ap_end,#8*2]
1860
1861	b	.Lmul4x_done
1862
1863.align	4
1864.Lmul4x4_post_condition:
1865	adc	$carry,$carry,xzr
1866	ldr	$ap,[x29,#96]		// pull rp
1867	// $acc0-3,$carry hold result, $m0-7 hold modulus
1868	subs	$a0,$acc0,$m0
1869	ldr	x30,[x29,#8]		// pull return address
1870	sbcs	$a1,$acc1,$m1
1871	 stp	xzr,xzr,[sp,#8*0]
1872	sbcs	$a2,$acc2,$m2
1873	 stp	xzr,xzr,[sp,#8*2]
1874	sbcs	$a3,$acc3,$m3
1875	 stp	xzr,xzr,[sp,#8*4]
1876	sbcs	xzr,$carry,xzr		// did it borrow?
1877	 stp	xzr,xzr,[sp,#8*6]
1878
1879	// $a0-3 hold result-modulus
1880	csel	$a0,$acc0,$a0,lo
1881	csel	$a1,$acc1,$a1,lo
1882	csel	$a2,$acc2,$a2,lo
1883	csel	$a3,$acc3,$a3,lo
1884	stp	$a0,$a1,[$ap,#8*0]
1885	stp	$a2,$a3,[$ap,#8*2]
1886
1887.Lmul4x_done:
1888	ldp	x19,x20,[x29,#16]
1889	mov	sp,x29
1890	ldp	x21,x22,[x29,#32]
1891	mov	x0,#1
1892	ldp	x23,x24,[x29,#48]
1893	ldp	x25,x26,[x29,#64]
1894	ldp	x27,x28,[x29,#80]
1895	ldr	x29,[sp],#128
1896	// x30 loaded earlier
1897	AARCH64_VALIDATE_LINK_REGISTER
1898	ret
1899.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1900___
1901}
1902$code.=<<___;
1903.rodata
1904.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1905.align	4
1906___
1907
1908print $code;
1909
1910close STDOUT or die "error closing STDOUT: $!";
1911