xref: /openssl/crypto/ec/asm/ecp_sm2p256-armv8.pl (revision c6e65c1f)
1#! /usr/bin/env perl
2# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator arm-xlate.pl: first next to this script,
# then in the perlasm directory of the OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe our output through the translator and alias STDOUT to the pipe,
# so every print below is post-processed for the target assembler.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register aliases used by the generated assembly.
# $s0..$s7 (x7..x14) hold up to eight 64-bit limbs of a 512-bit value.
# $a8..$a15 name the SAME registers viewed as 32-bit pieces during
# reduction; note the interleaved mapping (a8=x7, a10=x8, a12=x9,
# a14=x10, a9=x11, a11=x12, a13=x13, a15=x14), matching the even/odd
# split performed by the RDC macro's and/lsr spread.
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
# Scratch temporaries (x3..x6, x15..x17, x19, x20).
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));
27
# bn_mod_add($mod)
#
# Append to $code the body of a modular addition: r = (a + b) mod m,
# where m is the 256-bit constant at assembler label $mod and the
# AArch64 arguments are x0 = r, x1 = a, x2 = b (all four-limb values,
# inputs assumed < m).  The 257-bit sum keeps its carry in $t4; the
# modulus is subtracted once and the pre-subtraction sum is re-selected
# when that subtraction borrows (carry clear => sum < m).
#
# Fix: dropped the empty prototype "()" — the sub takes an argument, and
# prototypes alter parsing rather than validate arguments (the existing
# &-style call sites bypassed it anyway).
sub bn_mod_add {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Addition
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adcs $s3,$s3,$s7
	adc $t4,xzr,xzr

	// Load polynomial
	adrp x2,$mod
	add x2,x2,:lo12:$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup Addition
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	subs $t0,$t0,$s4
	sbcs $t1,$t1,$s5
	sbcs $t2,$t2,$s6
	sbcs $t3,$t3,$s7
	sbcs $t4,$t4,xzr

	// Select based on carry
	csel $s0,$s0,$t0,cc
	csel $s1,$s1,$t1,cc
	csel $s2,$s2,$t2,cc
	csel $s3,$s3,$t3,cc

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
74
# bn_mod_sub($mod)
#
# Append to $code the body of a modular subtraction: r = (a - b) mod m,
# where m is the 256-bit constant at assembler label $mod and the
# AArch64 arguments are x0 = r, x1 = a, x2 = b (inputs assumed < m).
# The raw subtraction leaves a borrow indicator in $t4 (0 or -1); the
# modulus is then added back and the corrected value is selected only
# when $t4 is non-zero (tst/ne-path via eq-select of the raw result).
#
# Fix: dropped the empty prototype "()" — the sub takes an argument, and
# prototypes alter parsing rather than validate arguments (the existing
# &-style call sites bypassed it anyway).
sub bn_mod_sub {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Subtraction
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbc $t4,xzr,xzr

	// Load polynomial
	adrp x2,$mod
	add x2,x2,:lo12:$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup subtraction
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Add polynomial
	adds $t0,$t0,$s4
	adcs $t1,$t1,$s5
	adcs $t2,$t2,$s6
	adcs $t3,$t3,$s7
	tst $t4,$t4

	// Select based on carry
	csel $s0,$s0,$t0,eq
	csel $s1,$s1,$t1,eq
	csel $s2,$s2,$t2,eq
	csel $s3,$s3,$t3,eq

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
121
# bn_mod_div_by_2($mod)
#
# Append to $code the body of a modular halving: r = a / 2 mod m, with
# AArch64 arguments x0 = r, x1 = a (a assumed < m).  $mod is the label
# of the PRECOMPUTED constant (m + 1) / 2, not m itself: the input is
# shifted right one bit, and when the discarded low bit was 1 the
# (m+1)/2 constant is added, which is the standard branch-free halving
# trick (a odd => a/2 mod m == (a >> 1) + (m+1)/2 mod m).
#
# Fix: dropped the empty prototype "()" — the sub takes an argument, and
# prototypes alter parsing rather than validate arguments (the existing
# &-style call sites bypassed it anyway).
sub bn_mod_div_by_2 {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]

	// Save the least significant bit
	mov $t0,$s0

	// Right shift 1
	extr $s0,$s1,$s0,#1
	extr $s1,$s2,$s1,#1
	extr $s2,$s3,$s2,#1
	lsr $s3,$s3,#1

	// Load mod
	adrp x2,$mod
	add x2,x2,:lo12:$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Parity check
	tst $t0,#1
	csel $s4,xzr,$s4,eq
	csel $s5,xzr,$s5,eq
	csel $s6,xzr,$s6,eq
	csel $s7,xzr,$s7,eq

	// Add
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adc $s3,$s3,$s7

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
162
{
# Emit the read-only constants (SM2 field prime p, group order n, and
# the precomputed (p+1)/2 and (n+1)/2 used by the halving helpers),
# then the plain bignum helpers bn_rshift1/bn_sub and the prologue of
# ecp_sm2p256_div_by_2.  Calling convention throughout: x0 = r,
# x1 = a, x2 = b.
$code.=<<___;
#include "arm_arch.h"
.arch  armv8-a
.rodata

.align	5
// The polynomial p
.Lpoly:
.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The order of polynomial n
.Lord:
.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

.text

// void bn_rshift1(BN_ULONG *a);
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x0]
	ldp $s2,$s3,[x0,#16]

	// Right shift
	extr $s0,$s1,$s0,#1
	extr $s1,$s2,$s1,#1
	extr $s2,$s3,$s2,#1
	lsr $s3,$s3,#1

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Subtraction
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbc $s3,$s3,$s7

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
.align	5
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
___
	# Halving mod p uses the precomputed (p+1)/2 table entry.
	&bn_mod_div_by_2(".Lpoly_div_2");
# Epilogue of ecp_sm2p256_div_by_2, then the remaining thin entry
# points (halving mod n, mul-by-3, add/sub mod p, sub mod n).  Each
# entry point is a prologue/epilogue wrapper around one of the Perl
# helper subs that emit the shared arithmetic bodies.
$code.=<<___;
	ret
.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
.align	5
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	# Halving mod the order n uses the precomputed (n+1)/2 entry.
	&bn_mod_div_by_2(".Lord_div_2");
# ecp_sm2p256_mul_by_3 computes r = 3*a mod p as ((2*a mod p) + a) mod p,
# with a conditional subtraction of p after each doubling/addition.
$code.=<<___;
	ret
.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
.align	5
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]

	// 2*a
	adds $s0,$s0,$s0
	adcs $s1,$s1,$s1
	adcs $s2,$s2,$s2
	adcs $s3,$s3,$s3
	adcs $t4,xzr,xzr

	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	adrp x2,.Lpoly
	add x2,x2,:lo12:.Lpoly
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbcs $t4,$t4,xzr

	csel $s0,$s0,$t0,cs
	csel $s1,$s1,$t1,cs
	csel $s2,$s2,$t2,cs
	csel $s3,$s3,$t3,cs
	eor $t4,$t4,$t4

	// 3*a
	ldp $s4,$s5,[x1]
	ldp $s6,$s7,[x1,#16]
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adcs $s3,$s3,$s7
	adcs $t4,xzr,xzr

	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	adrp x2,.Lpoly
	add x2,x2,:lo12:.Lpoly
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbcs $t4,$t4,xzr

	csel $s0,$s0,$t0,cs
	csel $s1,$s1,$t1,cs
	csel $s2,$s2,$t2,cs
	csel $s3,$s3,$t3,cs

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
.align	5
ecp_sm2p256_add:
	AARCH64_VALID_CALL_TARGET
___
	# r = (a + b) mod p
	&bn_mod_add(".Lpoly");
$code.=<<___;
	ret
.size ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
.align	5
ecp_sm2p256_sub:
	AARCH64_VALID_CALL_TARGET
___
	# r = (a - b) mod p
	&bn_mod_sub(".Lpoly");
$code.=<<___;
	ret
.size ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
.align	5
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	# r = (a - b) mod n (the group order)
	&bn_mod_sub(".Lord");
# Epilogue of ecp_sm2p256_sub_mod_ord, the RDC fast-reduction macro
# (reduces an eight-limb product in s0..s7 modulo the SM2 prime p,
# exploiting p's sparse 32-bit structure), and the two heavy routines
# ecp_sm2p256_mul and ecp_sm2p256_sqr that use it.  RDC assumes 32
# bytes of scratch at [sp,#32..#63]; both callers reserve an 80-byte
# frame for this plus the saved x16/x17/x19/x20.
$code.=<<___;
	ret
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

.macro RDC
	// a = |  s7   | ... | s0  |, where si are 64-bit quantities
	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
	// |    s7     |    s6     |    s5     |    s4     |
	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
	// |    s3     |    s2     |    s1     |    s0     |
	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
	// =================================================
	// | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
	// | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
	// | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
	// | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
	// | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
	// | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
	// | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
	// | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
	// | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
	// | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
	// |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
	// |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
	// |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
	// |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
	// |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

	// 1. 64-bit addition
	// t2=s6+s7+s7
	adds $t2,$s6,$s7
	adcs $t1,xzr,xzr
	adds $t2,$t2,$s7
	adcs $t1,$t1,xzr
	// t3=s4+s5+t2
	adds $t3,$s4,$t2
	adcs $t4,$t1,xzr
	adds $t3,$t3,$s5
	adcs $t4,$t4,xzr
	// sum
	adds $s0,$s0,$t3
	adcs $s1,$s1,$t4
	adcs $s2,$s2,$t2
	adcs $s3,$s3,$s7
	adcs $t0,xzr,xzr
	adds $s3,$s3,$t1
	adcs $t0,$t0,xzr

	stp $s0,$s1,[sp,#32]
	stp $s2,$s3,[sp,#48]

	// 2. 64-bit to 32-bit spread
	mov $t1,#0xffffffff
	mov $s0,$s4
	mov $s1,$s5
	mov $s2,$s6
	mov $s3,$s7
	and $s0,$s0,$t1 // a8
	and $s1,$s1,$t1 // a10
	and $s2,$s2,$t1 // a12
	and $s3,$s3,$t1 // a14
	lsr $s4,$s4,#32 // a9
	lsr $s5,$s5,#32 // a11
	lsr $s6,$s6,#32 // a13
	lsr $s7,$s7,#32 // a15

	// 3. 32-bit addition
	add $t1,$a14,$a12  // t1 <- a12 + a14
	add $t2,$a15,$a13  // t2 <- a13 + a15
	add $t3,$a8,$a9    // t3 <- a8 + a9
	add $t4,$a14,$a10  // t4 <- a10 + a14
	add $a15,$a15,$a11 // a15 <- a11 + a15
	add $a12,$t2,$t1   // a12 <- a12 + a13 + a14 + a15
	add $a10,$a10,$a12 // a10 <- a10 + a12 + a13 + a14 + a15
	add $a10,$a10,$a12 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add $a10,$a10,$t3  // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add $a10,$a10,$a11 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add $a12,$a12,$a13 // a12 <- a12 + 2*a13 + a14 + a15
	add $a12,$a12,$a11 // a12 <- a11 + a12 + 2*a13 + a14 + a15
	add $a12,$a12,$a8  // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add $t3,$t3,$a14   // t3 <- a8 + a9 + a14
	add $t3,$t3,$a13   // t3 <- a8 + a9 + a13 + a14
	add $a9,$a9,$t2    // a9 <- a9 + a13 + a15
	add $a11,$a11,$a9  // a11 <- a9 + a11 + a13 + a15
	add $a11,$a11,$t2  // a11 <- a9 + a11 + 2*(a13 + a15)
	add $t1,$t1,$t4    // t1 <- a10 + a12 + 2*a14

	// U[0]  s5	a9 + a11 + 2*(a13 + a15)
	// U[1]  t1	a10 + a12 + 2*a14
	// U[2] -t3	a8 + a9 + a13 + a14
	// U[3]  s2	a8 + a11 + a12 + 2*a13 + a14 + a15
	// U[4]  s4	a9 + a13 + a15
	// U[5]  t4	a10 + a14
	// U[6]  s7	a11 + a15
	// U[7]  s1	a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

	// 4. 32-bit to 64-bit
	lsl $s0,$t1,#32
	extr $t1,$s2,$t1,#32
	extr $s2,$t4,$s2,#32
	extr $t4,$s1,$t4,#32
	lsr $s1,$s1,#32

	// 5. 64-bit addition
	adds $s5,$s5,$s0
	adcs $t1,$t1,xzr
	adcs $s4,$s4,$s2
	adcs $s7,$s7,$t4
	adcs $t0,$t0,$s1

	// V[0]	s5
	// V[1]	t1
	// V[2]	s4
	// V[3]	s7
	// carry	t0
	// sub	t3

	// 5. Process s0-s3
	ldp $s0,$s1,[sp,#32]
	ldp $s2,$s3,[sp,#48]
	// add with V0-V3
	adds $s0,$s0,$s5
	adcs $s1,$s1,$t1
	adcs $s2,$s2,$s4
	adcs $s3,$s3,$s7
	adcs $t0,$t0,xzr
	// sub with t3
	subs $s1,$s1,$t3
	sbcs $s2,$s2,xzr
	sbcs $s3,$s3,xzr
	sbcs $t0,$t0,xzr

	// 6. MOD
	// First Mod
	lsl $t1,$t0,#32
	subs $t2,$t1,$t0

	adds $s0,$s0,$t0
	adcs $s1,$s1,$t2
	adcs $s2,$s2,xzr
	adcs $s3,$s3,$t1

	// Last Mod
	// return y - p if y > p else y
	mov $s4,$s0
	mov $s5,$s1
	mov $s6,$s2
	mov $s7,$s3

	adrp $t0,.Lpoly
	add $t0,$t0,:lo12:.Lpoly
	ldp $t1,$t2,[$t0]
	ldp $t3,$t4,[$t0,#16]

	adcs $t5,xzr,xzr

	subs $s0,$s0,$t1
	sbcs $s1,$s1,$t2
	sbcs $s2,$s2,$t3
	sbcs $s3,$s3,$t4
	sbcs $t5,$t5,xzr

	csel $s0,$s0,$s4,cs
	csel $s1,$s1,$s5,cs
	csel $s2,$s2,$s6,cs
	csel $s3,$s3,$s7,cs

.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
.align	5
ecp_sm2p256_mul:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x16,x17,[sp,#16]
	stp x19,x20,[sp,#64]

	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

// ### multiplication ###
	// ========================
	//             s3 s2 s1 s0
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s0 s0 s0 s0
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s1 s1 s1 s1
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s2 s2 s2 s2
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s3 s3 s3 s3
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s0*s4 ###
	mul $t5,$s0,$s4
	umulh $t2,$s0,$s4

// ### s1*s4 + s0*s5 ###
	mul $t0,$s1,$s4
	umulh $t1,$s1,$s4
	adds $t2,$t2,$t0
	adcs $t3,$t1,xzr

	mul $t0,$s0,$s5
	umulh $t1,$s0,$s5
	adds $t2,$t2,$t0
	adcs $t3,$t3,$t1
	adcs $t4,xzr,xzr

// ### s2*s4 + s1*s5 + s0*s6 ###
	mul $t0,$s2,$s4
	umulh $t1,$s2,$s4
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1

	mul $t0,$s1,$s5
	umulh $t1,$s1,$s5
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1
	adcs $t6,xzr,xzr

	mul $t0,$s0,$s6
	umulh $t1,$s0,$s6
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1
	adcs $t6,$t6,xzr

// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	mul $t0,$s3,$s4
	umulh $t1,$s3,$s4
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,xzr,xzr

	mul $t0,$s2,$s5
	umulh $t1,$s2,$s5
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

	mul $t0,$s1,$s6
	umulh $t1,$s1,$s6
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

	mul $t0,$s0,$s7
	umulh $t1,$s0,$s7
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

// ### s3*s5 + s2*s6 + s1*s7 ###
	mul $t0,$s3,$s5
	umulh $t1,$s3,$s5
	adds $t6,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,xzr,xzr

	mul $t0,$s2,$s6
	umulh $t1,$s2,$s6
	adds $t6,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,$t8,xzr

	mul $t0,$s1,$s7
	umulh $t1,$s1,$s7
	adds $s4,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,$t8,xzr

// ### s3*s6 + s2*s7 ###
	mul $t0,$s3,$s6
	umulh $t1,$s3,$s6
	adds $t7,$t7,$t0
	adcs $t8,$t8,$t1
	adcs $t6,xzr,xzr

	mul $t0,$s2,$s7
	umulh $t1,$s2,$s7
	adds $s5,$t7,$t0
	adcs $t8,$t8,$t1
	adcs $t6,$t6,xzr

// ### s3*s7 ###
	mul $t0,$s3,$s7
	umulh $t1,$s3,$s7
	adds $s6,$t8,$t0
	adcs $s7,$t6,$t1

	mov $s0,$t5
	mov $s1,$t2
	mov $s2,$t3
	mov $s3,$t4

	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp x16,x17,[sp,#16]
	ldp x19,x20,[sp,#64]
	ldp x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
.align	5

ecp_sm2p256_sqr:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x16,x17,[sp,#16]
	stp x19,x20,[sp,#64]

	// Load inputs
	ldp $s4,$s5,[x1]
	ldp $s6,$s7,[x1,#16]

// ### square ###
	// ========================
	//             s7 s6 s5 s4
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s4 s4 s4 s4
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s5 s5 s5 s5
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s6 s6 s6 s6
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s7 s7 s7 s7
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s4*s5 ###
	mul $s1,$s4,$s5
	umulh $s2,$s4,$s5

// ### s4*s6 ###
	mul $t0,$s6,$s4
	umulh $s3,$s6,$s4
	adds $s2,$s2,$t0
	adcs $s3,$s3,xzr

// ### s4*s7 + s5*s6 ###
	mul $t0,$s7,$s4
	umulh $t1,$s7,$s4
	adds $s3,$s3,$t0
	adcs $s0,$t1,xzr

	mul $t0,$s6,$s5
	umulh $t1,$s6,$s5
	adds $s3,$s3,$t0
	adcs $s0,$s0,$t1
	adcs $t2,xzr,xzr

// ### s5*s7 ###
	mul $t0,$s7,$s5
	umulh $t1,$s7,$s5
	adds $s0,$s0,$t0
	adcs $t2,$t2,$t1

// ### s6*s7 ###
	mul $t0,$s7,$s6
	umulh $t1,$s7,$s6
	adds $t2,$t2,$t0
	adcs $t3,$t1,xzr

// ### 2*(t3,t2,s0,s3,s2,s1) ###
	adds $s1,$s1,$s1
	adcs $s2,$s2,$s2
	adcs $s3,$s3,$s3
	adcs $s0,$s0,$s0
	adcs $t2,$t2,$t2
	adcs $t3,$t3,$t3
	adcs $t4,xzr,xzr

// ### s4*s4 ###
	mul $t5,$s4,$s4
	umulh $t6,$s4,$s4

// ### s5*s5 ###
	mul $s4,$s5,$s5
	umulh $s5,$s5,$s5

// ### s6*s6 ###
	mul $t0,$s6,$s6
	umulh $t1,$s6,$s6

// ### s7*s7 ###
	mul $t7,$s7,$s7
	umulh $t8,$s7,$s7

	adds $s1,$s1,$t6
	adcs $s2,$s2,$s4
	adcs $s3,$s3,$s5
	adcs $s0,$s0,$t0
	adcs $t2,$t2,$t1
	adcs $t3,$t3,$t7
	adcs $t4,$t4,$t8

	mov $s4,$s0
	mov $s0,$t5
	mov $s5,$t2
	mov $s6,$t3
	mov $s7,$t4

	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp x16,x17,[sp,#16]
	ldp x19,x20,[sp,#64]
	ldp x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
}
825
# Post-process the accumulated assembly: evaluate any `...` fragment as
# Perl (a standard perlasm idiom), then send each line down the
# arm-xlate pipe attached to STDOUT.  Closing STDOUT is checked so a
# failure in the translator is not silently dropped.
for my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	print $line,"\n";
}
close STDOUT or die "error closing STDOUT: $!";		# enforce flush