#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# POWER7		+260-530%
# POWER8		+220-340%

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes at
# 64-byte intervals, similar to
#	1111222233334444
#	1234123412341234
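# (In other words: each of the 37 sub-tables holds 64 points of 64
# bytes each, and the loop below stores byte i of point j at offset
# 64*i+j, so that ecp_nistz256_gather_w7 further down can assemble one
# point from byte loads with a fixed 64-byte stride.)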
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}

$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0,  0($bp)
	ld	$acc1,8($ap)
	ld	$t1,  8($bp)
	ld	$acc2,16($ap)
	ld	$t2,  16($bp)
	ld	$acc3,24($ap)
	ld	$t3,  24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
for($i=1;$i<4;$i++) {
	################################################################
	# A reduction iteration is normally performed by accumulating
	# the result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and the
	# "magic" digit being equal to the least significant word, it
	# can be performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite the above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or, marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------
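	#
	# Concretely, the two subf* below compute the 128-bit value
	# (t3:t2) = acc0 + acc0*2^64 - acc0*2^32 = acc0*0xffffffff00000001,
	# while (t1:t0) = acc0*2^32; the additions then fold in
	# acc0*(P+1) with P = 2^256-2^224+2^192+2^96-1, whose least
	# significant word cancels and is dropped.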

$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	 mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	 mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	 mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	 mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3
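	# (after the subtraction acc4 is 0 or all-ones; since the least
	# significant limb of the modulus is all-ones, acc4 itself doubles
	# as the masked low limb in the conditional addition above)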

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
577	#  "can't overflow" below mark carrying into high part of
578	#  multiplication result, which can't overflow, because it
579	#  can never be all ones.
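	#  (For x,y < 2^64 the high limb of x*y is at most 2^64-2, so
	#  adding a single carry bit cannot wrap around.)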

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	 sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.

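	# subic rd,ra,-1 is addic rd,ra,1: subtracting the all-ones low
	# limb of the modulus amounts to adding 1, with CA set exactly
	# when the subtraction would not borrow, as the subfe's expect.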
	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0 	# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
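	# add the modulus unconditionally, subtract it back if the input
	# was even (mask t0), then shift the 5-limb result right one bit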
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	 neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	 not	$t0,$t0
	addze	$acc2,$acc2
	 li	$t2,0
	adde	$acc3,$acc3,$poly3
	 and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	 and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap,  $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("r$_",(20,21));
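# Loads marked "forward load" below are hoisted ahead of the preceding
# subroutine call to hide load latency; they fill $a0-$a3/$bi, which
# the add/sub/div_by_2 helpers called in between never touch.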

$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,64($ap)		# forward load for p256_sqr_mont
	 ld	$a1,72($ap)
	 ld	$a2,80($ap)
	 ld	$a3,88($ap)
	 mr	$rp_real,$rp
	 mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	 ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	 ld	$a0,$M+0($sp)
	 ld	$a1,$M+8($sp)
	 ld	$a2,$M+16($sp)
	 ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	 ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	 ld	$a0,$S+0($sp)
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$M+8($sp)
	 ld	$a2,$M+16($sp)
	 ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	 mr	$rp_real,$rp
	 mr	$ap_real,$ap
	 mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
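	# despite its name, in2infty is really a "point is finite" mask:
	# x|-x has its top bit set iff x != 0, and sradi smears that bit,
	# so the register is all-ones iff in2_z != 0 (hence "!in2infty");
	# in1infty below is computed the same way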
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	 ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	 ld	$a0,0($ap_real)
	 ld	$a1,8($ap_real)
	 ld	$a2,16($ap_real)
	 ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	 ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$R+8($sp)
	 ld	$a2,$R+16($sp)
	 ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	 ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	 ld	$a0,$S1+0($sp)
	 ld	$a1,$S1+8($sp)
	 ld	$a2,$S1+16($sp)
	 ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
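# Constant-time selection: the in1infty/in2infty masks are all-ones
# when the corresponding input is finite, so limb by limb this picks
# result = in2_z==0 ? in1 : (in1_z==0 ? in2 : computed res), without
# branching on secret data.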
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	 ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	 ld	$a0,$Z1sqr+0($sp)
	 ld	$a1,$Z1sqr+8($sp)
	 ld	$a2,$Z1sqr+16($sp)
	 ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	 ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$H+8($sp)
	 ld	$a2,$H+16($sp)
	 ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	#  p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	 ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	 ld	$a0,$Hcub+0($sp)
	 ld	$a1,$Hcub+8($sp)
	 ld	$a2,$Hcub+16($sp)
	 ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
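# Same constant-time selection as at the end of ecp_nistz256_point_add,
# except that the affine in2 carries no z coordinate: at $i==32 the
# li/not sequence below materializes one in Montgomery form
# (1, ~poly1, -1, ~poly3) to stand in for in2_z.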
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

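	# build the three 64-bit order constants 16 bits at a time:
	# lis/ori assemble what will become the upper half, sldi moves
	# it up, and oris/ori fill in the lower half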
	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
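	#
	# In other words: t4 = acc0*ordk with ordk = -1/ord mod 2^64, so
	# acc + t4*ord clears the least significant limb, one limb of a
	# textbook Montgomery reduction; "addic t0,acc0,-1" below merely
	# reproduces the carry out of that discarded limb (CA = acc0!=0).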
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	 mulld	$t0,$a0,$bi
	addze	$t3,$t3
	 mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	 mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	 mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  The "can't overflow" remarks below mark carries into the high
	#  part of a multiplication result, which can't overflow because
	#  that part can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	 mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	 mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	 mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	 mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	 mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	 mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	 mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index
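	# w5 table layout: each 256-bit value is split into eight 32-bit
	# words, and word k of entry "index" lives at byte offset
	# 64*k + 4*index, i.e. 16 entries interleaved per 64-byte row
	# (the -4 below apparently compensates for index being 1-based)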

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index
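	# r0 is all-ones if index != 0 and zero otherwise; adding it to
	# index converts 1-based to 0-based, and masking the loaded words
	# with it makes index 0 return the all-zero point (infinity)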

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
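	# same 0/all-ones masking as in gather_w5 above: index 0 yields
	# the all-zero point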
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush