xref: /openssl/crypto/ec/asm/x25519-ppc64.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# X25519 lower-level primitives for PPC64.
17#
18# July 2018.
19#
20# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
21# faster on PPC970/G5. POWER8 on the other hand seems to trip on own
22# shoelaces when handling longer carry chains. As base 2^51 has just
23# single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is
24# pretty old, base 2^64 implementation is not engaged. Comparison to
25# compiler-generated code is complicated by the fact that not all
26# compilers support 128-bit integers. When compiler doesn't, like xlc,
27# this module delivers more than 2x improvement, and when it does,
28# from 12% to 30% improvement was measured...
29
30# $output is the last argument if it looks like a file (it has an extension)
31# $flavour is the first argument if it doesn't look like a file
# Pick up the optional flavour/output arguments (see the two comment
# lines above), then locate the ppc-xlate.pl translator: first next to
# this script, then in ../../perlasm/ relative to it.
32$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
33$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
37( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
38die "can't locate ppc-xlate.pl";
39
# Everything printed to STDOUT from here on is piped through ppc-xlate.pl,
# run with the same perl binary ($^X).  The translator path is quoted like
# the interpreter and the output file name, so a source tree located under
# a directory whose name contains spaces still works.
40open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
41    or die "can't call $xlate: $!";
42*STDOUT=*OUT;
43
# Register names shared by all templates below: r1 is used as the stack
# pointer, r3..r5 as the result pointer and the two input pointers.
44my $sp = "r1";
45my ($rp,$ap,$bp) = map("r$_",3..5);
46
47####################################################### base 2^64
48if (0) {
# Base 2^64 implementation: four 64-bit limbs per field element.  The
# whole section is wrapped in "if (0)", so none of the x25519_fe64_*
# routines below is emitted.
#
# Register names: $bi,$a0-$a3,$t0,$t1 are r6-r12; $t2,$t3 and the eight
# accumulators $acc0-$acc7 are r22-r31, which the mul/sqr templates save
# and restore in their stack frame.
49my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
50    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
51    map("r$_",(6..12,22..31));
# r0 is cleared with xor wherever a zero operand is needed.
52my $zero = "r0";
# Frame size in bytes; r22-r31 are stored in its top ten slots.
53my $FRAME = 16*8;
54
55$code.=<<___;
56.text
57
58.globl	x25519_fe64_mul
59.type	x25519_fe64_mul,\@function
60.align	5
61x25519_fe64_mul:
62	stdu	$sp,-$FRAME($sp)
63	std	r22,`$FRAME-8*10`($sp)
64	std	r23,`$FRAME-8*9`($sp)
65	std	r24,`$FRAME-8*8`($sp)
66	std	r25,`$FRAME-8*7`($sp)
67	std	r26,`$FRAME-8*6`($sp)
68	std	r27,`$FRAME-8*5`($sp)
69	std	r28,`$FRAME-8*4`($sp)
70	std	r29,`$FRAME-8*3`($sp)
71	std	r30,`$FRAME-8*2`($sp)
72	std	r31,`$FRAME-8*1`($sp)
73
74	ld	$bi,0($bp)
75	ld	$a0,0($ap)
76	xor	$zero,$zero,$zero
77	ld	$a1,8($ap)
78	ld	$a2,16($ap)
79	ld	$a3,24($ap)
80
81	mulld	$acc0,$a0,$bi		# a[0]*b[0]
82	mulhdu	$t0,$a0,$bi
83	mulld	$acc1,$a1,$bi		# a[1]*b[0]
84	mulhdu	$t1,$a1,$bi
85	mulld	$acc2,$a2,$bi		# a[2]*b[0]
86	mulhdu	$t2,$a2,$bi
87	mulld	$acc3,$a3,$bi		# a[3]*b[0]
88	mulhdu	$t3,$a3,$bi
89___
# Emit the passes for b[1]..b[3].  Each pass first adds the high halves
# left in $t0-$t3 by the previous pass, then the low halves of
# a[0..3]*b[i], and leaves the new high halves in $t0-$t3.  shift(@acc)
# slides the accumulator window up by one limb per pass.
90for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
91    my $i=1; $i<4; shift(@acc), $i++) {
# On the first pass the top accumulator of the window has not been written
# yet, so r0 (zero) stands in as the addend.
92my $acc4 = $i==1? $zero : @acc[4];
93
94$code.=<<___;
95	ld	$bi,`8*$i`($bp)
96	addc	@acc[1],@acc[1],$t0	# accumulate high parts
97	mulld	$t0,$a0,$bi
98	adde	@acc[2],@acc[2],$t1
99	mulld	$t1,$a1,$bi
100	adde	@acc[3],@acc[3],$t2
101	mulld	$t2,$a2,$bi
102	adde	@acc[4],$acc4,$t3
103	mulld	$t3,$a3,$bi
104	addc	@acc[1],@acc[1],$t0	# accumulate low parts
105	mulhdu	$t0,$a0,$bi
106	adde	@acc[2],@acc[2],$t1
107	mulhdu	$t1,$a1,$bi
108	adde	@acc[3],@acc[3],$t2
109	mulhdu	$t2,$a2,$bi
110	adde	@acc[4],@acc[4],$t3
111	mulhdu	$t3,$a3,$bi
112	adde	@acc[5],$zero,$zero
113___
114}
# Fold in the last high halves, then reduce: the upper four limbs are
# multiplied by 38 and added to the lower four, and whatever spills out of
# the top is multiplied by 38 and added once more.  The remaining templates
# of this section (sqr, mul121666, add, sub, start of tobytes) follow in
# the same heredoc.
115$code.=<<___;
116	li	$bi,38
117	addc	$acc4,$acc4,$t0
118	mulld	$t0,$acc4,$bi
119	adde	$acc5,$acc5,$t1
120	mulld	$t1,$acc5,$bi
121	adde	$acc6,$acc6,$t2
122	mulld	$t2,$acc6,$bi
123	adde	$acc7,$acc7,$t3
124	mulld	$t3,$acc7,$bi
125
126	addc	$acc0,$acc0,$t0
127	mulhdu	$t0,$acc4,$bi
128	adde	$acc1,$acc1,$t1
129	mulhdu	$t1,$acc5,$bi
130	adde	$acc2,$acc2,$t2
131	mulhdu	$t2,$acc6,$bi
132	adde	$acc3,$acc3,$t3
133	mulhdu	$t3,$acc7,$bi
134	adde	$acc4,$zero,$zero
135
136	addc	$acc1,$acc1,$t0
137	adde	$acc2,$acc2,$t1
138	adde	$acc3,$acc3,$t2
139	adde	$acc4,$acc4,$t3
140
141	mulld	$acc4,$acc4,$bi
142
143	addc	$acc0,$acc0,$acc4
144	addze	$acc1,$acc1
145	addze	$acc2,$acc2
146	addze	$acc3,$acc3
147
148	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
149	std	$acc1,8($rp)
150	andc	$acc4,$bi,$acc4
151	std	$acc2,16($rp)
152	add	$acc0,$acc0,$acc4
153	std	$acc3,24($rp)
154	std	$acc0,0($rp)
155
156	ld	r22,`$FRAME-8*10`($sp)
157	ld	r23,`$FRAME-8*9`($sp)
158	ld	r24,`$FRAME-8*8`($sp)
159	ld	r25,`$FRAME-8*7`($sp)
160	ld	r26,`$FRAME-8*6`($sp)
161	ld	r27,`$FRAME-8*5`($sp)
162	ld	r28,`$FRAME-8*4`($sp)
163	ld	r29,`$FRAME-8*3`($sp)
164	ld	r30,`$FRAME-8*2`($sp)
165	ld	r31,`$FRAME-8*1`($sp)
166	addi	$sp,$sp,$FRAME
167	blr
168	.long	0
169	.byte	0,12,4,0,0x80,10,3,0
170	.long	0
171.size	x25519_fe64_mul,.-x25519_fe64_mul
172
173.globl	x25519_fe64_sqr
174.type	x25519_fe64_sqr,\@function
175.align	5
176x25519_fe64_sqr:
177	stdu	$sp,-$FRAME($sp)
178	std	r22,`$FRAME-8*10`($sp)
179	std	r23,`$FRAME-8*9`($sp)
180	std	r24,`$FRAME-8*8`($sp)
181	std	r25,`$FRAME-8*7`($sp)
182	std	r26,`$FRAME-8*6`($sp)
183	std	r27,`$FRAME-8*5`($sp)
184	std	r28,`$FRAME-8*4`($sp)
185	std	r29,`$FRAME-8*3`($sp)
186	std	r30,`$FRAME-8*2`($sp)
187	std	r31,`$FRAME-8*1`($sp)
188
189	ld	$a0,0($ap)
190	xor	$zero,$zero,$zero
191	ld	$a1,8($ap)
192	ld	$a2,16($ap)
193	ld	$a3,24($ap)
194
195	################################
196	#  |  |  |  |  |  |a1*a0|  |
197	#  |  |  |  |  |a2*a0|  |  |
198	#  |  |a3*a2|a3*a0|  |  |  |
199	#  |  |  |  |a2*a1|  |  |  |
200	#  |  |  |a3*a1|  |  |  |  |
201	# *|  |  |  |  |  |  |  | 2|
202	# +|a3*a3|a2*a2|a1*a1|a0*a0|
203	#  |--+--+--+--+--+--+--+--|
204	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
205	#
206	#  "can't overflow" below mark carrying into high part of
207	#  multiplication result, which can't overflow, because it
208	#  can never be all ones.
209
210	mulld	$acc1,$a1,$a0		# a[1]*a[0]
211	mulhdu	$t1,$a1,$a0
212	mulld	$acc2,$a2,$a0		# a[2]*a[0]
213	mulhdu	$t2,$a2,$a0
214	mulld	$acc3,$a3,$a0		# a[3]*a[0]
215	mulhdu	$acc4,$a3,$a0
216
217	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
218	 mulld	$t0,$a2,$a1		# a[2]*a[1]
219	 mulhdu	$t1,$a2,$a1
220	adde	$acc3,$acc3,$t2
221	 mulld	$t2,$a3,$a1		# a[3]*a[1]
222	 mulhdu	$t3,$a3,$a1
223	addze	$acc4,$acc4		# can't overflow
224
225	mulld	$acc5,$a3,$a2		# a[3]*a[2]
226	mulhdu	$acc6,$a3,$a2
227
228	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
229	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
230	addze	$t2,$t3			# can't overflow
231
232	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
233	 mulhdu	$a0,$a0,$a0
234	adde	$acc4,$acc4,$t1
235	 mulld	$t1,$a1,$a1		# a[1]*a[1]
236	adde	$acc5,$acc5,$t2
237	 mulhdu	$a1,$a1,$a1
238	addze	$acc6,$acc6		# can't overflow
239
240	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
241	 mulld	$t2,$a2,$a2		# a[2]*a[2]
242	adde	$acc2,$acc2,$acc2
243	 mulhdu	$a2,$a2,$a2
244	adde	$acc3,$acc3,$acc3
245	 mulld	$t3,$a3,$a3		# a[3]*a[3]
246	adde	$acc4,$acc4,$acc4
247	 mulhdu	$a3,$a3,$a3
248	adde	$acc5,$acc5,$acc5
249	adde	$acc6,$acc6,$acc6
250	addze	$acc7,$zero
251
252	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
253	 li	$bi,38
254	adde	$acc2,$acc2,$t1
255	adde	$acc3,$acc3,$a1
256	adde	$acc4,$acc4,$t2
257	adde	$acc5,$acc5,$a2
258	adde	$acc6,$acc6,$t3
259	adde	$acc7,$acc7,$a3
260
261	mulld	$t0,$acc4,$bi
262	mulld	$t1,$acc5,$bi
263	mulld	$t2,$acc6,$bi
264	mulld	$t3,$acc7,$bi
265
266	addc	$acc0,$acc0,$t0
267	mulhdu	$t0,$acc4,$bi
268	adde	$acc1,$acc1,$t1
269	mulhdu	$t1,$acc5,$bi
270	adde	$acc2,$acc2,$t2
271	mulhdu	$t2,$acc6,$bi
272	adde	$acc3,$acc3,$t3
273	mulhdu	$t3,$acc7,$bi
274	addze	$acc4,$zero
275
276	addc	$acc1,$acc1,$t0
277	adde	$acc2,$acc2,$t1
278	adde	$acc3,$acc3,$t2
279	adde	$acc4,$acc4,$t3
280
281	mulld	$acc4,$acc4,$bi
282
283	addc	$acc0,$acc0,$acc4
284	addze	$acc1,$acc1
285	addze	$acc2,$acc2
286	addze	$acc3,$acc3
287
288	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
289	std	$acc1,8($rp)
290	andc	$acc4,$bi,$acc4
291	std	$acc2,16($rp)
292	add	$acc0,$acc0,$acc4
293	std	$acc3,24($rp)
294	std	$acc0,0($rp)
295
296	ld	r22,`$FRAME-8*10`($sp)
297	ld	r23,`$FRAME-8*9`($sp)
298	ld	r24,`$FRAME-8*8`($sp)
299	ld	r25,`$FRAME-8*7`($sp)
300	ld	r26,`$FRAME-8*6`($sp)
301	ld	r27,`$FRAME-8*5`($sp)
302	ld	r28,`$FRAME-8*4`($sp)
303	ld	r29,`$FRAME-8*3`($sp)
304	ld	r30,`$FRAME-8*2`($sp)
305	ld	r31,`$FRAME-8*1`($sp)
306	addi	$sp,$sp,$FRAME
307	blr
308	.long	0
309	.byte	0,12,4,0,0x80,10,2,0
310	.long	0
311.size	x25519_fe64_sqr,.-x25519_fe64_sqr
312
313.globl	x25519_fe64_mul121666
314.type	x25519_fe64_mul121666,\@function
315.align	5
316x25519_fe64_mul121666:
317	lis	$bi,`65536>>16`
318	ori	$bi,$bi,`121666-65536`
319
320	ld	$t0,0($ap)
321	ld	$t1,8($ap)
322	ld	$bp,16($ap)
323	ld	$ap,24($ap)
324
325	mulld	$a0,$t0,$bi
326	mulhdu	$t0,$t0,$bi
327	mulld	$a1,$t1,$bi
328	mulhdu	$t1,$t1,$bi
329	mulld	$a2,$bp,$bi
330	mulhdu	$bp,$bp,$bi
331	mulld	$a3,$ap,$bi
332	mulhdu	$ap,$ap,$bi
333
334	addc	$a1,$a1,$t0
335	adde	$a2,$a2,$t1
336	adde	$a3,$a3,$bp
337	addze	$ap,    $ap
338
339	mulli	$ap,$ap,38
340
341	addc	$a0,$a0,$ap
342	addze	$a1,$a1
343	addze	$a2,$a2
344	addze	$a3,$a3
345
	li	$t0,38			# $t0 still held hi(a[0]*121666); li leaves CA intact
346	subfe	$t1,$t1,$t1		# carry -> ~mask
347	std	$a1,8($rp)
348	andc	$t0,$t0,$t1
349	std	$a2,16($rp)
350	add	$a0,$a0,$t0
351	std	$a3,24($rp)
352	std	$a0,0($rp)
353
354	blr
355	.long	0
356	.byte	0,12,0x14,0,0,0,2,0
357	.long	0
358.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666
359
360.globl	x25519_fe64_add
361.type	x25519_fe64_add,\@function
362.align	5
363x25519_fe64_add:
364	ld	$a0,0($ap)
365	ld	$t0,0($bp)
366	ld	$a1,8($ap)
367	ld	$t1,8($bp)
368	ld	$a2,16($ap)
369	ld	$bi,16($bp)
370	ld	$a3,24($ap)
371	ld	$bp,24($bp)
372
373	addc	$a0,$a0,$t0
374	adde	$a1,$a1,$t1
375	adde	$a2,$a2,$bi
376	adde	$a3,$a3,$bp
377
378	li	$t0,38
379	subfe	$t1,$t1,$t1		# carry -> ~mask
380	andc	$t1,$t0,$t1
381
382	addc	$a0,$a0,$t1
383	addze	$a1,$a1
384	addze	$a2,$a2
385	addze	$a3,$a3
386
387	subfe	$t1,$t1,$t1		# carry -> ~mask
388	std	$a1,8($rp)
389	andc	$t0,$t0,$t1
390	std	$a2,16($rp)
391	add	$a0,$a0,$t0
392	std	$a3,24($rp)
393	std	$a0,0($rp)
394
395	blr
396	.long	0
397	.byte	0,12,0x14,0,0,0,3,0
398	.long	0
399.size	x25519_fe64_add,.-x25519_fe64_add
400
401.globl	x25519_fe64_sub
402.type	x25519_fe64_sub,\@function
403.align	5
404x25519_fe64_sub:
405	ld	$a0,0($ap)
406	ld	$t0,0($bp)
407	ld	$a1,8($ap)
408	ld	$t1,8($bp)
409	ld	$a2,16($ap)
410	ld	$bi,16($bp)
411	ld	$a3,24($ap)
412	ld	$bp,24($bp)
413
414	subfc	$a0,$t0,$a0
415	subfe	$a1,$t1,$a1
416	subfe	$a2,$bi,$a2
417	subfe	$a3,$bp,$a3
418
419	li	$t0,38
420	subfe	$t1,$t1,$t1		# borrow -> mask
421	xor	$zero,$zero,$zero
422	and	$t1,$t0,$t1
423
424	subfc	$a0,$t1,$a0
425	subfe	$a1,$zero,$a1
426	subfe	$a2,$zero,$a2
427	subfe	$a3,$zero,$a3
428
429	subfe	$t1,$t1,$t1		# borrow -> mask
430	std	$a1,8($rp)
431	and	$t0,$t0,$t1
432	std	$a2,16($rp)
433	subf	$a0,$t0,$a0
434	std	$a3,24($rp)
435	std	$a0,0($rp)
436
437	blr
438	.long	0
439	.byte	0,12,0x14,0,0,0,3,0
440	.long	0
441.size	x25519_fe64_sub,.-x25519_fe64_sub
442
443.globl	x25519_fe64_tobytes
444.type	x25519_fe64_tobytes,\@function
445.align	5
446x25519_fe64_tobytes:
447	ld	$a3,24($ap)
448	ld	$a0,0($ap)
449	ld	$a1,8($ap)
450	ld	$a2,16($ap)
451
452	sradi	$t0,$a3,63		# most significant bit -> mask
453	li	$t1,19
454	and	$t0,$t0,$t1
455	sldi	$a3,$a3,1
456	add	$t0,$t0,$t1		# compare to modulus in the same go
457	srdi	$a3,$a3,1		# most significant bit cleared
458
459	addc	$a0,$a0,$t0
460	addze	$a1,$a1
461	addze	$a2,$a2
462	addze	$a3,$a3
463
464	xor	$zero,$zero,$zero
465	sradi	$t0,$a3,63		# most significant bit -> mask
466	sldi	$a3,$a3,1
467	andc	$t0,$t1,$t0
468	srdi	$a3,$a3,1		# most significant bit cleared
469
470	subi	$rp,$rp,1
471	subfc	$a0,$t0,$a0
472	subfe	$a1,$zero,$a1
473	subfe	$a2,$zero,$a2
474	subfe	$a3,$zero,$a3
475
476___
# Store each of the four limbs as eight bytes, least significant byte
# first.  stbu advances $rp by one per byte, which is why $rp was
# decremented by one above.
477for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
478$code.=<<___;
479	srdi	$t0,@a[0],8
480	stbu	@a[0],1($rp)
481	srdi	@a[0],@a[0],16
482	stbu	$t0,1($rp)
483	srdi	$t0,@a[0],8
484	stbu	@a[0],1($rp)
485	srdi	@a[0],@a[0],16
486	stbu	$t0,1($rp)
487	srdi	$t0,@a[0],8
488	stbu	@a[0],1($rp)
489	srdi	@a[0],@a[0],16
490	stbu	$t0,1($rp)
491	srdi	$t0,@a[0],8
492	stbu	@a[0],1($rp)
493	stbu	$t0,1($rp)
494___
495}
496$code.=<<___;
497	blr
498	.long	0
499	.byte	0,12,0x14,0,0,0,2,0
500	.long	0
501.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
502___
503}
504####################################################### base 2^51
505{
# Base 2^51 implementation: five limbs per field element, each product
# accumulated as a lo/hi register pair.  This is the section that is
# actually emitted.
#
# Register names: $bi,$a0-$a4,$t0 are r6-r12; $t1 is r21 and the pairs
# $h0lo/$h0hi .. $h4lo/$h4hi are r22-r31.
506my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
507    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
508    map("r$_",(6..12,21..31));
# r0 receives the 51-bit limb mask inside .Lfe51_reduce.
509my $mask = "r0";
# Frame size in bytes; r21-r31 are stored in its top eleven slots.
510my $FRAME = 18*8;
511
512$code.=<<___;
513.text
514
515.globl	x25519_fe51_mul
516.type	x25519_fe51_mul,\@function
517.align	5
518x25519_fe51_mul:
519	stdu	$sp,-$FRAME($sp)
520	std	r21,`$FRAME-8*11`($sp)
521	std	r22,`$FRAME-8*10`($sp)
522	std	r23,`$FRAME-8*9`($sp)
523	std	r24,`$FRAME-8*8`($sp)
524	std	r25,`$FRAME-8*7`($sp)
525	std	r26,`$FRAME-8*6`($sp)
526	std	r27,`$FRAME-8*5`($sp)
527	std	r28,`$FRAME-8*4`($sp)
528	std	r29,`$FRAME-8*3`($sp)
529	std	r30,`$FRAME-8*2`($sp)
530	std	r31,`$FRAME-8*1`($sp)
531
532	ld	$bi,0($bp)
533	ld	$a0,0($ap)
534	ld	$a1,8($ap)
535	ld	$a2,16($ap)
536	ld	$a3,24($ap)
537	ld	$a4,32($ap)
538
539	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
540	mulhdu	$h0hi,$a0,$bi
541
542	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
543	mulhdu	$h1hi,$a1,$bi
544
545	 mulld	$h4lo,$a4,$bi		# a[4]*b[0]
546	 mulhdu	$h4hi,$a4,$bi
547	 ld	$ap,8($bp)
548	 mulli	$a4,$a4,19
549
550	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
551	mulhdu	$h2hi,$a2,$bi
552
553	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
554	mulhdu	$h3hi,$a3,$bi
555___
# Emit the passes for b[1], b[2] and b[3].  Every pass multiplies all five
# entries of @a by the current b[i] and accumulates into h0..h4, loads the
# next b[] limb into the register currently named $ap, and multiplies
# @a[3] by 19 for use in the following pass.  @a is rotated by one entry
# at the end of each pass so that the same template text addresses the
# right limb next time round.
556for(my @a=($a0,$a1,$a2,$a3,$a4),
557    my $i=1; $i<4; $i++) {
	# Swap the Perl-level names: the register loaded through $ap in the
	# previous pass now serves as $bi, and the old $bi register becomes
	# the target of this pass's load.
558	($ap,$bi) = ($bi,$ap);
559$code.=<<___;
560	mulld	$t0,@a[4],$bi
561	mulhdu	$t1,@a[4],$bi
562	addc	$h0lo,$h0lo,$t0
563	adde	$h0hi,$h0hi,$t1
564
565	mulld	$t0,@a[0],$bi
566	mulhdu	$t1,@a[0],$bi
567	addc	$h1lo,$h1lo,$t0
568	adde	$h1hi,$h1hi,$t1
569
570	 mulld	$t0,@a[3],$bi
571	 mulhdu	$t1,@a[3],$bi
572	 ld	$ap,`8*($i+1)`($bp)
573	 mulli	@a[3],@a[3],19
574	 addc	$h4lo,$h4lo,$t0
575	 adde	$h4hi,$h4hi,$t1
576
577	mulld	$t0,@a[1],$bi
578	mulhdu	$t1,@a[1],$bi
579	addc	$h2lo,$h2lo,$t0
580	adde	$h2hi,$h2hi,$t1
581
582	mulld	$t0,@a[2],$bi
583	mulhdu	$t1,@a[2],$bi
584	addc	$h3lo,$h3lo,$t0
585	adde	$h3hi,$h3hi,$t1
586___
	# Rotate: the last entry of @a becomes the first.
587	unshift(@a,pop(@a));
588}
	# Fourth swap; together with the three performed in the loop it
	# leaves $ap and $bi naming their original registers for the
	# templates that follow the final b[4] pass.
589	($ap,$bi) = ($bi,$ap);
590$code.=<<___;
591	mulld	$t0,$a1,$bi
592	mulhdu	$t1,$a1,$bi
593	addc	$h0lo,$h0lo,$t0
594	adde	$h0hi,$h0hi,$t1
595
596	mulld	$t0,$a2,$bi
597	mulhdu	$t1,$a2,$bi
598	addc	$h1lo,$h1lo,$t0
599	adde	$h1hi,$h1hi,$t1
600
601	mulld	$t0,$a3,$bi
602	mulhdu	$t1,$a3,$bi
603	addc	$h2lo,$h2lo,$t0
604	adde	$h2hi,$h2hi,$t1
605
606	mulld	$t0,$a4,$bi
607	mulhdu	$t1,$a4,$bi
608	addc	$h3lo,$h3lo,$t0
609	adde	$h3hi,$h3hi,$t1
610
611	mulld	$t0,$a0,$bi
612	mulhdu	$t1,$a0,$bi
613	addc	$h4lo,$h4lo,$t0
614	adde	$h4hi,$h4hi,$t1
615
616.Lfe51_reduce:
617	li	$mask,-1
618	srdi	$mask,$mask,13		# 0x7ffffffffffff
619
620	srdi	$t0,$h2lo,51
621	and	$a2,$h2lo,$mask
622	insrdi	$t0,$h2hi,51,0		# h2>>51
623	 srdi	$t1,$h0lo,51
624	 and	$a0,$h0lo,$mask
625	 insrdi	$t1,$h0hi,51,0		# h0>>51
626	addc	$h3lo,$h3lo,$t0
627	addze	$h3hi,$h3hi
628	 addc	$h1lo,$h1lo,$t1
629	 addze	$h1hi,$h1hi
630
631	srdi	$t0,$h3lo,51
632	and	$a3,$h3lo,$mask
633	insrdi	$t0,$h3hi,51,0		# h3>>51
634	 srdi	$t1,$h1lo,51
635	 and	$a1,$h1lo,$mask
636	 insrdi	$t1,$h1hi,51,0		# h1>>51
637	addc	$h4lo,$h4lo,$t0
638	addze	$h4hi,$h4hi
639	 add	$a2,$a2,$t1
640
641	srdi	$t0,$h4lo,51
642	and	$a4,$h4lo,$mask
643	insrdi	$t0,$h4hi,51,0
644	mulli	$t0,$t0,19		# (h4 >> 51) * 19
645
646	add	$a0,$a0,$t0
647
648	srdi	$t1,$a2,51
649	and	$a2,$a2,$mask
650	add	$a3,$a3,$t1
651
652	srdi	$t0,$a0,51
653	and	$a0,$a0,$mask
654	add	$a1,$a1,$t0
655
656	std	$a2,16($rp)
657	std	$a3,24($rp)
658	std	$a4,32($rp)
659	std	$a0,0($rp)
660	std	$a1,8($rp)
661
662	ld	r21,`$FRAME-8*11`($sp)
663	ld	r22,`$FRAME-8*10`($sp)
664	ld	r23,`$FRAME-8*9`($sp)
665	ld	r24,`$FRAME-8*8`($sp)
666	ld	r25,`$FRAME-8*7`($sp)
667	ld	r26,`$FRAME-8*6`($sp)
668	ld	r27,`$FRAME-8*5`($sp)
669	ld	r28,`$FRAME-8*4`($sp)
670	ld	r29,`$FRAME-8*3`($sp)
671	ld	r30,`$FRAME-8*2`($sp)
672	ld	r31,`$FRAME-8*1`($sp)
673	addi	$sp,$sp,$FRAME
674	blr
675	.long	0
676	.byte	0,12,4,0,0x80,11,3,0
677	.long	0
678.size	x25519_fe51_mul,.-x25519_fe51_mul
679___
# x25519_fe51_sqr template.  It gets its own scope with private copies of
# the limb and scratch names, so the name swaps performed below do not
# leak into the x25519_fe51_mul121666 template emitted afterwards.  The
# routine ends by branching to .Lfe51_reduce, i.e. it shares the reduction
# and epilogue emitted as part of x25519_fe51_mul.
680{
681my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
682$code.=<<___;
683.globl	x25519_fe51_sqr
684.type	x25519_fe51_sqr,\@function
685.align	5
686x25519_fe51_sqr:
687	stdu	$sp,-$FRAME($sp)
688	std	r21,`$FRAME-8*11`($sp)
689	std	r22,`$FRAME-8*10`($sp)
690	std	r23,`$FRAME-8*9`($sp)
691	std	r24,`$FRAME-8*8`($sp)
692	std	r25,`$FRAME-8*7`($sp)
693	std	r26,`$FRAME-8*6`($sp)
694	std	r27,`$FRAME-8*5`($sp)
695	std	r28,`$FRAME-8*4`($sp)
696	std	r29,`$FRAME-8*3`($sp)
697	std	r30,`$FRAME-8*2`($sp)
698	std	r31,`$FRAME-8*1`($sp)
699
700	ld	$a0,0($ap)
701	ld	$a1,8($ap)
702	ld	$a2,16($ap)
703	ld	$a3,24($ap)
704	ld	$a4,32($ap)
705
706	add	$bi,$a0,$a0		# a[0]*2
707	mulli	$t1,$a4,19		# a[4]*19
708
709	mulld	$h0lo,$a0,$a0
710	mulhdu	$h0hi,$a0,$a0
711	mulld	$h1lo,$a1,$bi
712	mulhdu	$h1hi,$a1,$bi
713	mulld	$h2lo,$a2,$bi
714	mulhdu	$h2hi,$a2,$bi
715	mulld	$h3lo,$a3,$bi
716	mulhdu	$h3hi,$a3,$bi
717	mulld	$h4lo,$a4,$bi
718	mulhdu	$h4hi,$a4,$bi
719	add	$bi,$a1,$a1		# a[1]*2
720___
	# From here on $a4 names the register holding a[4]*19 (the mulli
	# above) and $t1 names the register holding the original a[4]; the
	# first mulld/mulhdu pair below consumes the latter and then reuses
	# it as scratch.
721	($a4,$t1) = ($t1,$a4);
722$code.=<<___;
723	mulld	$t0,$t1,$a4
724	mulhdu	$t1,$t1,$a4
725	addc	$h3lo,$h3lo,$t0
726	adde	$h3hi,$h3hi,$t1
727
728	mulli	$bp,$a3,19		# a[3]*19
729
730	mulld	$t0,$a1,$a1
731	mulhdu	$t1,$a1,$a1
732	addc	$h2lo,$h2lo,$t0
733	adde	$h2hi,$h2hi,$t1
734	mulld	$t0,$a2,$bi
735	mulhdu	$t1,$a2,$bi
736	addc	$h3lo,$h3lo,$t0
737	adde	$h3hi,$h3hi,$t1
738	mulld	$t0,$a3,$bi
739	mulhdu	$t1,$a3,$bi
740	addc	$h4lo,$h4lo,$t0
741	adde	$h4hi,$h4hi,$t1
742	mulld	$t0,$a4,$bi
743	mulhdu	$t1,$a4,$bi
744	add	$bi,$a3,$a3		# a[3]*2
745	addc	$h0lo,$h0lo,$t0
746	adde	$h0hi,$h0hi,$t1
747___
	# Same trick for a[3]: $a3 now names the $bp register, which holds
	# a[3]*19, and $t1 names the register holding the original a[3].
748	($a3,$t1) = ($bp,$a3);
749$code.=<<___;
750	mulld	$t0,$t1,$a3
751	mulhdu	$t1,$t1,$a3
752	addc	$h1lo,$h1lo,$t0
753	adde	$h1hi,$h1hi,$t1
754	mulld	$t0,$bi,$a4
755	mulhdu	$t1,$bi,$a4
756	add	$bi,$a2,$a2		# a[2]*2
757	addc	$h2lo,$h2lo,$t0
758	adde	$h2hi,$h2hi,$t1
759
760	mulld	$t0,$a2,$a2
761	mulhdu	$t1,$a2,$a2
762	addc	$h4lo,$h4lo,$t0
763	adde	$h4hi,$h4hi,$t1
764	mulld	$t0,$a3,$bi
765	mulhdu	$t1,$a3,$bi
766	addc	$h0lo,$h0lo,$t0
767	adde	$h0hi,$h0hi,$t1
768	mulld	$t0,$a4,$bi
769	mulhdu	$t1,$a4,$bi
770	addc	$h1lo,$h1lo,$t0
771	adde	$h1hi,$h1hi,$t1
772
773	b	.Lfe51_reduce
774	.long	0
775	.byte	0,12,4,0,0x80,11,2,0
776	.long	0
777.size	x25519_fe51_sqr,.-x25519_fe51_sqr
778___
779}
# x25519_fe51_mul121666 template: multiplies each of the five limbs by the
# constant 121666 (built with lis/ori) into h0..h4 and then branches to the
# shared .Lfe51_reduce tail.
780$code.=<<___;
781.globl	x25519_fe51_mul121666
782.type	x25519_fe51_mul121666,\@function
783.align	5
784x25519_fe51_mul121666:
785	stdu	$sp,-$FRAME($sp)
786	std	r21,`$FRAME-8*11`($sp)
787	std	r22,`$FRAME-8*10`($sp)
788	std	r23,`$FRAME-8*9`($sp)
789	std	r24,`$FRAME-8*8`($sp)
790	std	r25,`$FRAME-8*7`($sp)
791	std	r26,`$FRAME-8*6`($sp)
792	std	r27,`$FRAME-8*5`($sp)
793	std	r28,`$FRAME-8*4`($sp)
794	std	r29,`$FRAME-8*3`($sp)
795	std	r30,`$FRAME-8*2`($sp)
796	std	r31,`$FRAME-8*1`($sp)
797
798	lis	$bi,`65536>>16`
799	ori	$bi,$bi,`121666-65536`
800	ld	$a0,0($ap)
801	ld	$a1,8($ap)
802	ld	$a2,16($ap)
803	ld	$a3,24($ap)
804	ld	$a4,32($ap)
805
806	mulld	$h0lo,$a0,$bi		# a[0]*121666
807	mulhdu	$h0hi,$a0,$bi
808	mulld	$h1lo,$a1,$bi		# a[1]*121666
809	mulhdu	$h1hi,$a1,$bi
810	mulld	$h2lo,$a2,$bi		# a[2]*121666
811	mulhdu	$h2hi,$a2,$bi
812	mulld	$h3lo,$a3,$bi		# a[3]*121666
813	mulhdu	$h3hi,$a3,$bi
814	mulld	$h4lo,$a4,$bi		# a[4]*121666
815	mulhdu	$h4hi,$a4,$bi
816
817	b	.Lfe51_reduce
818	.long	0
819	.byte	0,12,4,0,0x80,11,2,0
820	.long	0
821.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
822___
823}
824
# Replace every `...` span in the accumulated text with the result of
# evaluating its contents as a Perl expression (this is what resolves the
# frame offsets and constants written between backticks in the templates).
825$code =~ s/\`([^\`]*)\`/eval $1/gem;
# STDOUT was aliased to the ppc-xlate.pl pipe earlier in this script, so
# this hands the text to the translator.
826print $code;
# Check the close as well, so a failure reported when the pipe is closed
# is not silently ignored.
827close STDOUT or die "error closing STDOUT: $!";
828