xref: /openssl/crypto/ec/asm/ecp_sm2p256-armv8.pl (revision b6461792)
1#! /usr/bin/env perl
2# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# $output is the last argument if it looks like a file (it has an extension)
10# $flavour is the first argument if it doesn't look like a file
# Standard perlasm driver preamble: the last argument is taken as the
# output file when it looks like one (has an extension), and the first
# argument is taken as the $flavour (e.g. linux64, win64) when it does
# not look like a file name.
$output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;
$flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift(@ARGV) : undef;

# Locate arm-xlate.pl next to this script or in crypto/perlasm/.
if ($0 =~ m/(.*[\/\\])[^\/\\]+$/) { $dir = $1; }
$xlate = "${dir}arm-xlate.pl";
if (!-f $xlate) {
	$xlate = "${dir}../../perlasm/arm-xlate.pl";
	-f $xlate or die "can't locate arm-xlate.pl";
}

# Pipe everything we print through the translator; alias STDOUT so the
# plain print at the bottom of the file goes through it.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
22
# Register aliases.  $s0..$s7 (x7-x14) hold a 512-bit quantity as eight
# little-endian 64-bit limbs; $t0..$t8 are scratch.  x19/x20 ($t7/$t8)
# are callee-saved and are spilled by the routines that use them.
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
# 32-bit "spread" view of the SAME registers (x7-x14) used inside the
# RDC macro: after the 64->32-bit split, x7-x10 hold the even 32-bit
# words a8/a10/a12/a14 and x11-x14 hold the odd words a9/a11/a13/a15.
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));
27
# bn_mod_add($mod) - append to $code a sequence computing
#   [x0] = ([x1] + [x2]) mod <$mod>
# on 256-bit operands stored as four little-endian 64-bit limbs.
# $mod is the assembler label of the modulus constant (.Lpoly or
# .Lord).  Reduction is a single conditional subtraction of the
# modulus (sum kept when the trial subtraction borrows), which fully
# reduces the result provided both inputs are already < modulus.
# Clobbers x2-x15 and the NZCV flags; x2 is overwritten with the
# modulus address after the operand loads.
sub bn_mod_add() {
	my $mod = shift;	# label of the 4x64-bit modulus constant
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Addition
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adcs $s3,$s3,$s7
	adc $t4,xzr,xzr

	// Load polynomial
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup Addition
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	subs $t0,$t0,$s4
	sbcs $t1,$t1,$s5
	sbcs $t2,$t2,$s6
	sbcs $t3,$t3,$s7
	sbcs $t4,$t4,xzr

	// Select based on carry
	csel $s0,$s0,$t0,cc
	csel $s1,$s1,$t1,cc
	csel $s2,$s2,$t2,cc
	csel $s3,$s3,$t3,cc

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
73
# bn_mod_sub($mod) - append to $code a sequence computing
#   [x0] = ([x1] - [x2]) mod <$mod>
# on 256-bit operands stored as four little-endian 64-bit limbs.
# $mod is the assembler label of the modulus constant (.Lpoly or
# .Lord).  A borrow from the raw subtraction is recorded in $t4
# (0 or -1); the modulus is then added back and that result is
# selected only when a borrow occurred (tst/csel on eq).
# Clobbers x2-x15 and the NZCV flags; x2 is overwritten with the
# modulus address after the operand loads.
sub bn_mod_sub() {
	my $mod = shift;	# label of the 4x64-bit modulus constant
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Subtraction
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbc $t4,xzr,xzr

	// Load polynomial
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup subtraction
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Add polynomial
	adds $t0,$t0,$s4
	adcs $t1,$t1,$s5
	adcs $t2,$t2,$s6
	adcs $t3,$t3,$s7
	tst $t4,$t4

	// Select based on carry
	csel $s0,$s0,$t0,eq
	csel $s1,$s1,$t1,eq
	csel $s2,$s2,$t2,eq
	csel $s3,$s3,$t3,eq

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
119
# bn_mod_div_by_2($mod) - append to $code a sequence computing
#   [x0] = [x1] / 2 mod m
# for a 256-bit input.  NOTE: the $mod argument is the label of the
# PRECOMPUTED constant (m + 1) / 2 (.Lpoly_div_2 or .Lord_div_2), not
# of the modulus itself.  For odd a < m:  a/2 mod m = (a>>1) + (m+1)/2,
# which stays below m, so the trailing add needs no reduction; for
# even a the constant is masked to zero and the result is just a>>1.
# Clobbers x2-x14 and the NZCV flags.
sub bn_mod_div_by_2() {
	my $mod = shift;	# label of the (m + 1) / 2 constant
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]

	// Save the least significant bit
	mov $t0,$s0

	// Right shift 1
	extr $s0,$s1,$s0,#1
	extr $s1,$s2,$s1,#1
	extr $s2,$s3,$s2,#1
	lsr $s3,$s3,#1

	// Load mod
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Parity check
	tst $t0,#1
	csel $s4,xzr,$s4,eq
	csel $s5,xzr,$s5,eq
	csel $s6,xzr,$s6,eq
	csel $s7,xzr,$s7,eq

	// Add
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adc $s3,$s3,$s7

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}
159
160{
161$code.=<<___;
162#include "arm_arch.h"
163.arch  armv8-a
164.text
165
166.align	5
167// The polynomial p
168.Lpoly:
169.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
170// The order of polynomial n
171.Lord:
172.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
173// (p + 1) / 2
174.Lpoly_div_2:
175.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
176// (n + 1) / 2
177.Lord_div_2:
178.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
179
180// void bn_rshift1(BN_ULONG *a);
181.globl	bn_rshift1
182.type	bn_rshift1,%function
183.align	5
184bn_rshift1:
185	AARCH64_VALID_CALL_TARGET
186	// Load inputs
187	ldp $s0,$s1,[x0]
188	ldp $s2,$s3,[x0,#16]
189
190	// Right shift
191	extr $s0,$s1,$s0,#1
192	extr $s1,$s2,$s1,#1
193	extr $s2,$s3,$s2,#1
194	lsr $s3,$s3,#1
195
196	// Store results
197	stp $s0,$s1,[x0]
198	stp $s2,$s3,[x0,#16]
199
200	ret
201.size bn_rshift1,.-bn_rshift1
202
203// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
204.globl	bn_sub
205.type	bn_sub,%function
206.align	5
207bn_sub:
208	AARCH64_VALID_CALL_TARGET
209	// Load inputs
210	ldp $s0,$s1,[x1]
211	ldp $s2,$s3,[x1,#16]
212	ldp $s4,$s5,[x2]
213	ldp $s6,$s7,[x2,#16]
214
215	// Subtraction
216	subs $s0,$s0,$s4
217	sbcs $s1,$s1,$s5
218	sbcs $s2,$s2,$s6
219	sbc $s3,$s3,$s7
220
221	// Store results
222	stp $s0,$s1,[x0]
223	stp $s2,$s3,[x0,#16]
224
225	ret
226.size bn_sub,.-bn_sub
227
228// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
229.globl	ecp_sm2p256_div_by_2
230.type	ecp_sm2p256_div_by_2,%function
231.align	5
232ecp_sm2p256_div_by_2:
233	AARCH64_VALID_CALL_TARGET
234___
235	&bn_mod_div_by_2(".Lpoly_div_2");
236$code.=<<___;
237	ret
238.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
239
240// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
241.globl	ecp_sm2p256_div_by_2_mod_ord
242.type	ecp_sm2p256_div_by_2_mod_ord,%function
243.align	5
244ecp_sm2p256_div_by_2_mod_ord:
245	AARCH64_VALID_CALL_TARGET
246___
247	&bn_mod_div_by_2(".Lord_div_2");
248$code.=<<___;
249	ret
250.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
251
252// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
253.globl	ecp_sm2p256_mul_by_3
254.type	ecp_sm2p256_mul_by_3,%function
255.align	5
256ecp_sm2p256_mul_by_3:
257	AARCH64_VALID_CALL_TARGET
258	// Load inputs
259	ldp $s0,$s1,[x1]
260	ldp $s2,$s3,[x1,#16]
261
262	// 2*a
263	adds $s0,$s0,$s0
264	adcs $s1,$s1,$s1
265	adcs $s2,$s2,$s2
266	adcs $s3,$s3,$s3
267	adcs $t4,xzr,xzr
268
269	mov $t0,$s0
270	mov $t1,$s1
271	mov $t2,$s2
272	mov $t3,$s3
273
274	// Sub polynomial
275	adr x2,.Lpoly
276	ldp $s4,$s5,[x2]
277	ldp $s6,$s7,[x2,#16]
278	subs $s0,$s0,$s4
279	sbcs $s1,$s1,$s5
280	sbcs $s2,$s2,$s6
281	sbcs $s3,$s3,$s7
282	sbcs $t4,$t4,xzr
283
284	csel $s0,$s0,$t0,cs
285	csel $s1,$s1,$t1,cs
286	csel $s2,$s2,$t2,cs
287	csel $s3,$s3,$t3,cs
288	eor $t4,$t4,$t4
289
290	// 3*a
291	ldp $s4,$s5,[x1]
292	ldp $s6,$s7,[x1,#16]
293	adds $s0,$s0,$s4
294	adcs $s1,$s1,$s5
295	adcs $s2,$s2,$s6
296	adcs $s3,$s3,$s7
297	adcs $t4,xzr,xzr
298
299	mov $t0,$s0
300	mov $t1,$s1
301	mov $t2,$s2
302	mov $t3,$s3
303
304	// Sub polynomial
305	adr x2,.Lpoly
306	ldp $s4,$s5,[x2]
307	ldp $s6,$s7,[x2,#16]
308	subs $s0,$s0,$s4
309	sbcs $s1,$s1,$s5
310	sbcs $s2,$s2,$s6
311	sbcs $s3,$s3,$s7
312	sbcs $t4,$t4,xzr
313
314	csel $s0,$s0,$t0,cs
315	csel $s1,$s1,$t1,cs
316	csel $s2,$s2,$t2,cs
317	csel $s3,$s3,$t3,cs
318
319	// Store results
320	stp $s0,$s1,[x0]
321	stp $s2,$s3,[x0,#16]
322
323	ret
324.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
325
326// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
327.globl	ecp_sm2p256_add
328.type	ecp_sm2p256_add,%function
329.align	5
330ecp_sm2p256_add:
331	AARCH64_VALID_CALL_TARGET
332___
333	&bn_mod_add(".Lpoly");
334$code.=<<___;
335	ret
336.size ecp_sm2p256_add,.-ecp_sm2p256_add
337
338// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
339.globl	ecp_sm2p256_sub
340.type	ecp_sm2p256_sub,%function
341.align	5
342ecp_sm2p256_sub:
343	AARCH64_VALID_CALL_TARGET
344___
345	&bn_mod_sub(".Lpoly");
346$code.=<<___;
347	ret
348.size ecp_sm2p256_sub,.-ecp_sm2p256_sub
349
350// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
351.globl	ecp_sm2p256_sub_mod_ord
352.type	ecp_sm2p256_sub_mod_ord,%function
353.align	5
354ecp_sm2p256_sub_mod_ord:
355	AARCH64_VALID_CALL_TARGET
356___
357	&bn_mod_sub(".Lord");
358$code.=<<___;
359	ret
360.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
361
362.macro RDC
363	// a = |  s7   | ... | s0  |, where si are 64-bit quantities
364	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
365	// |    s7     |    s6     |    s5     |    s4     |
366	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
367	// |    s3     |    s2     |    s1     |    s0     |
368	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
369	// =================================================
370	// | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
371	// | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
372	// | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
373	// | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
374	// | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
375	// | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
376	// | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
377	// | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
378	// | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
379	// | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
380	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
381	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
382	// |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
383	// |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
384	// |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
385	// |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
386	// |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
387	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
388	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
389
390	// 1. 64-bit addition
391	// t2=s6+s7+s7
392	adds $t2,$s6,$s7
393	adcs $t1,xzr,xzr
394	adds $t2,$t2,$s7
395	adcs $t1,$t1,xzr
396	// t3=s4+s5+t2
397	adds $t3,$s4,$t2
398	adcs $t4,$t1,xzr
399	adds $t3,$t3,$s5
400	adcs $t4,$t4,xzr
401	// sum
402	adds $s0,$s0,$t3
403	adcs $s1,$s1,$t4
404	adcs $s2,$s2,$t2
405	adcs $s3,$s3,$s7
406	adcs $t0,xzr,xzr
407	adds $s3,$s3,$t1
408	adcs $t0,$t0,xzr
409
410	stp $s0,$s1,[sp,#32]
411	stp $s2,$s3,[sp,#48]
412
413	// 2. 64-bit to 32-bit spread
414	mov $t1,#0xffffffff
415	mov $s0,$s4
416	mov $s1,$s5
417	mov $s2,$s6
418	mov $s3,$s7
419	and $s0,$s0,$t1 // a8
420	and $s1,$s1,$t1 // a10
421	and $s2,$s2,$t1 // a12
422	and $s3,$s3,$t1 // a14
423	lsr $s4,$s4,#32 // a9
424	lsr $s5,$s5,#32 // a11
425	lsr $s6,$s6,#32 // a13
426	lsr $s7,$s7,#32 // a15
427
428	// 3. 32-bit addition
429	add $t1,$a14,$a12  // t1 <- a12 + a14
430	add $t2,$a15,$a13  // t2 <- a13 + a15
431	add $t3,$a8,$a9    // t3 <- a8 + a9
432	add $t4,$a14,$a10  // t4 <- a10 + a14
433	add $a15,$a15,$a11 // a15 <- a11 + a15
434	add $a12,$t2,$t1   // a12 <- a12 + a13 + a14 + a15
435	add $a10,$a10,$a12 // a10 <- a10 + a12 + a13 + a14 + a15
436	add $a10,$a10,$a12 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
437	add $a10,$a10,$t3  // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
438	add $a10,$a10,$a11 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
439	add $a12,$a12,$a13 // a12 <- a12 + 2*a13 + a14 + a15
440	add $a12,$a12,$a11 // a12 <- a11 + a12 + 2*a13 + a14 + a15
441	add $a12,$a12,$a8  // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
442	add $t3,$t3,$a14   // t3 <- a8 + a9 + a14
443	add $t3,$t3,$a13   // t3 <- a8 + a9 + a13 + a14
444	add $a9,$a9,$t2    // a9 <- a9 + a13 + a15
445	add $a11,$a11,$a9  // a11 <- a9 + a11 + a13 + a15
446	add $a11,$a11,$t2  // a11 <- a9 + a11 + 2*(a13 + a15)
447	add $t1,$t1,$t4    // t1 <- a10 + a12 + 2*a14
448
449	// U[0]  s5	a9 + a11 + 2*(a13 + a15)
450	// U[1]  t1	a10 + a12 + 2*a14
451	// U[2] -t3	a8 + a9 + a13 + a14
452	// U[3]  s2	a8 + a11 + a12 + 2*a13 + a14 + a15
453	// U[4]  s4	a9 + a13 + a15
454	// U[5]  t4	a10 + a14
455	// U[6]  s7	a11 + a15
456	// U[7]  s1	a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
457
458	// 4. 32-bit to 64-bit
459	lsl $s0,$t1,#32
460	extr $t1,$s2,$t1,#32
461	extr $s2,$t4,$s2,#32
462	extr $t4,$s1,$t4,#32
463	lsr $s1,$s1,#32
464
465	// 5. 64-bit addition
466	adds $s5,$s5,$s0
467	adcs $t1,$t1,xzr
468	adcs $s4,$s4,$s2
469	adcs $s7,$s7,$t4
470	adcs $t0,$t0,$s1
471
472	// V[0]	s5
473	// V[1]	t1
474	// V[2]	s4
475	// V[3]	s7
476	// carry	t0
477	// sub	t3
478
479	// 5. Process s0-s3
480	ldp $s0,$s1,[sp,#32]
481	ldp $s2,$s3,[sp,#48]
482	// add with V0-V3
483	adds $s0,$s0,$s5
484	adcs $s1,$s1,$t1
485	adcs $s2,$s2,$s4
486	adcs $s3,$s3,$s7
487	adcs $t0,$t0,xzr
488	// sub with t3
489	subs $s1,$s1,$t3
490	sbcs $s2,$s2,xzr
491	sbcs $s3,$s3,xzr
492	sbcs $t0,$t0,xzr
493
494	// 6. MOD
495	// First Mod
496	lsl $t1,$t0,#32
497	subs $t2,$t1,$t0
498
499	adds $s0,$s0,$t0
500	adcs $s1,$s1,$t2
501	adcs $s2,$s2,xzr
502	adcs $s3,$s3,$t1
503
504	// Last Mod
505	// return y - p if y > p else y
506	mov $s4,$s0
507	mov $s5,$s1
508	mov $s6,$s2
509	mov $s7,$s3
510
511	adr $t0,.Lpoly
512	ldp $t1,$t2,[$t0]
513	ldp $t3,$t4,[$t0,#16]
514
515	adcs $t5,xzr,xzr
516
517	subs $s0,$s0,$t1
518	sbcs $s1,$s1,$t2
519	sbcs $s2,$s2,$t3
520	sbcs $s3,$s3,$t4
521	sbcs $t5,$t5,xzr
522
523	csel $s0,$s0,$s4,cs
524	csel $s1,$s1,$s5,cs
525	csel $s2,$s2,$s6,cs
526	csel $s3,$s3,$s7,cs
527
528.endm
529
530// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
531.globl	ecp_sm2p256_mul
532.type	ecp_sm2p256_mul,%function
533.align	5
534ecp_sm2p256_mul:
535	AARCH64_SIGN_LINK_REGISTER
536	// Store scalar registers
537	stp x29,x30,[sp,#-80]!
538	add x29,sp,#0
539	stp x16,x17,[sp,#16]
540	stp x19,x20,[sp,#64]
541
542	// Load inputs
543	ldp $s0,$s1,[x1]
544	ldp $s2,$s3,[x1,#16]
545	ldp $s4,$s5,[x2]
546	ldp $s6,$s7,[x2,#16]
547
548// ### multiplication ###
549	// ========================
550	//             s3 s2 s1 s0
551	// *           s7 s6 s5 s4
552	// ------------------------
553	// +           s0 s0 s0 s0
554	//              *  *  *  *
555	//             s7 s6 s5 s4
556	//          s1 s1 s1 s1
557	//           *  *  *  *
558	//          s7 s6 s5 s4
559	//       s2 s2 s2 s2
560	//        *  *  *  *
561	//       s7 s6 s5 s4
562	//    s3 s3 s3 s3
563	//     *  *  *  *
564	//    s7 s6 s5 s4
565	// ------------------------
566	// s7 s6 s5 s4 s3 s2 s1 s0
567	// ========================
568
569// ### s0*s4 ###
570	mul $t5,$s0,$s4
571	umulh $t2,$s0,$s4
572
573// ### s1*s4 + s0*s5 ###
574	mul $t0,$s1,$s4
575	umulh $t1,$s1,$s4
576	adds $t2,$t2,$t0
577	adcs $t3,$t1,xzr
578
579	mul $t0,$s0,$s5
580	umulh $t1,$s0,$s5
581	adds $t2,$t2,$t0
582	adcs $t3,$t3,$t1
583	adcs $t4,xzr,xzr
584
585// ### s2*s4 + s1*s5 + s0*s6 ###
586	mul $t0,$s2,$s4
587	umulh $t1,$s2,$s4
588	adds $t3,$t3,$t0
589	adcs $t4,$t4,$t1
590
591	mul $t0,$s1,$s5
592	umulh $t1,$s1,$s5
593	adds $t3,$t3,$t0
594	adcs $t4,$t4,$t1
595	adcs $t6,xzr,xzr
596
597	mul $t0,$s0,$s6
598	umulh $t1,$s0,$s6
599	adds $t3,$t3,$t0
600	adcs $t4,$t4,$t1
601	adcs $t6,$t6,xzr
602
603// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
604	mul $t0,$s3,$s4
605	umulh $t1,$s3,$s4
606	adds $t4,$t4,$t0
607	adcs $t6,$t6,$t1
608	adcs $t7,xzr,xzr
609
610	mul $t0,$s2,$s5
611	umulh $t1,$s2,$s5
612	adds $t4,$t4,$t0
613	adcs $t6,$t6,$t1
614	adcs $t7,$t7,xzr
615
616	mul $t0,$s1,$s6
617	umulh $t1,$s1,$s6
618	adds $t4,$t4,$t0
619	adcs $t6,$t6,$t1
620	adcs $t7,$t7,xzr
621
622	mul $t0,$s0,$s7
623	umulh $t1,$s0,$s7
624	adds $t4,$t4,$t0
625	adcs $t6,$t6,$t1
626	adcs $t7,$t7,xzr
627
628// ### s3*s5 + s2*s6 + s1*s7 ###
629	mul $t0,$s3,$s5
630	umulh $t1,$s3,$s5
631	adds $t6,$t6,$t0
632	adcs $t7,$t7,$t1
633	adcs $t8,xzr,xzr
634
635	mul $t0,$s2,$s6
636	umulh $t1,$s2,$s6
637	adds $t6,$t6,$t0
638	adcs $t7,$t7,$t1
639	adcs $t8,$t8,xzr
640
641	mul $t0,$s1,$s7
642	umulh $t1,$s1,$s7
643	adds $s4,$t6,$t0
644	adcs $t7,$t7,$t1
645	adcs $t8,$t8,xzr
646
647// ### s3*s6 + s2*s7 ###
648	mul $t0,$s3,$s6
649	umulh $t1,$s3,$s6
650	adds $t7,$t7,$t0
651	adcs $t8,$t8,$t1
652	adcs $t6,xzr,xzr
653
654	mul $t0,$s2,$s7
655	umulh $t1,$s2,$s7
656	adds $s5,$t7,$t0
657	adcs $t8,$t8,$t1
658	adcs $t6,$t6,xzr
659
660// ### s3*s7 ###
661	mul $t0,$s3,$s7
662	umulh $t1,$s3,$s7
663	adds $s6,$t8,$t0
664	adcs $s7,$t6,$t1
665
666	mov $s0,$t5
667	mov $s1,$t2
668	mov $s2,$t3
669	mov $s3,$t4
670
671	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
672
673// ### Reduction ###
674	RDC
675
676	stp $s0,$s1,[x0]
677	stp $s2,$s3,[x0,#16]
678
679	// Restore scalar registers
680	ldp x16,x17,[sp,#16]
681	ldp x19,x20,[sp,#64]
682	ldp x29,x30,[sp],#80
683
684	AARCH64_VALIDATE_LINK_REGISTER
685	ret
686.size ecp_sm2p256_mul,.-ecp_sm2p256_mul
687
688// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
689.globl	ecp_sm2p256_sqr
690.type	ecp_sm2p256_sqr,%function
691.align	5
692
693ecp_sm2p256_sqr:
694	AARCH64_SIGN_LINK_REGISTER
695	// Store scalar registers
696	stp x29,x30,[sp,#-80]!
697	add x29,sp,#0
698	stp x16,x17,[sp,#16]
699	stp x19,x20,[sp,#64]
700
701	// Load inputs
702	ldp $s4,$s5,[x1]
703	ldp $s6,$s7,[x1,#16]
704
705// ### square ###
706	// ========================
707	//             s7 s6 s5 s4
708	// *           s7 s6 s5 s4
709	// ------------------------
710	// +           s4 s4 s4 s4
711	//              *  *  *  *
712	//             s7 s6 s5 s4
713	//          s5 s5 s5 s5
714	//           *  *  *  *
715	//          s7 s6 s5 s4
716	//       s6 s6 s6 s6
717	//        *  *  *  *
718	//       s7 s6 s5 s4
719	//    s7 s7 s7 s7
720	//     *  *  *  *
721	//    s7 s6 s5 s4
722	// ------------------------
723	// s7 s6 s5 s4 s3 s2 s1 s0
724	// ========================
725
726// ### s4*s5 ###
727	mul $s1,$s4,$s5
728	umulh $s2,$s4,$s5
729
730// ### s4*s6 ###
731	mul $t0,$s6,$s4
732	umulh $s3,$s6,$s4
733	adds $s2,$s2,$t0
734	adcs $s3,$s3,xzr
735
736// ### s4*s7 + s5*s6 ###
737	mul $t0,$s7,$s4
738	umulh $t1,$s7,$s4
739	adds $s3,$s3,$t0
740	adcs $s0,$t1,xzr
741
742	mul $t0,$s6,$s5
743	umulh $t1,$s6,$s5
744	adds $s3,$s3,$t0
745	adcs $s0,$s0,$t1
746	adcs $t2,xzr,xzr
747
748// ### s5*s7 ###
749	mul $t0,$s7,$s5
750	umulh $t1,$s7,$s5
751	adds $s0,$s0,$t0
752	adcs $t2,$t2,$t1
753
754// ### s6*s7 ###
755	mul $t0,$s7,$s6
756	umulh $t1,$s7,$s6
757	adds $t2,$t2,$t0
758	adcs $t3,$t1,xzr
759
760// ### 2*(t3,t2,s0,s3,s2,s1) ###
761	adds $s1,$s1,$s1
762	adcs $s2,$s2,$s2
763	adcs $s3,$s3,$s3
764	adcs $s0,$s0,$s0
765	adcs $t2,$t2,$t2
766	adcs $t3,$t3,$t3
767	adcs $t4,xzr,xzr
768
769// ### s4*s4 ###
770	mul $t5,$s4,$s4
771	umulh $t6,$s4,$s4
772
773// ### s5*s5 ###
774	mul $s4,$s5,$s5
775	umulh $s5,$s5,$s5
776
777// ### s6*s6 ###
778	mul $t0,$s6,$s6
779	umulh $t1,$s6,$s6
780
781// ### s7*s7 ###
782	mul $t7,$s7,$s7
783	umulh $t8,$s7,$s7
784
785	adds $s1,$s1,$t6
786	adcs $s2,$s2,$s4
787	adcs $s3,$s3,$s5
788	adcs $s0,$s0,$t0
789	adcs $t2,$t2,$t1
790	adcs $t3,$t3,$t7
791	adcs $t4,$t4,$t8
792
793	mov $s4,$s0
794	mov $s0,$t5
795	mov $s5,$t2
796	mov $s6,$t3
797	mov $s7,$t4
798
799	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
800
801// ### Reduction ###
802	RDC
803
804	stp $s0,$s1,[x0]
805	stp $s2,$s3,[x0,#16]
806
807	// Restore scalar registers
808	ldp x16,x17,[sp,#16]
809	ldp x19,x20,[sp,#64]
810	ldp x29,x30,[sp],#80
811
812	AARCH64_VALIDATE_LINK_REGISTER
813	ret
814.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
815___
816}
817
# Expand any `...` snippets embedded in the generated text (evaluated
# as Perl) and feed each finished line to the xlate pipe on STDOUT.
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	print $line,"\n";
}
# Closing STDOUT flushes the pipe and surfaces any write error.
close STDOUT or die "error closing STDOUT: $!";
824