xref: /openssl/crypto/bn/asm/ppc64-mont-fixed.pl (revision 2f1112b2)
1#! /usr/bin/env perl
2# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# ====================================================================
10# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
11# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
12# the OpenSSL project.
13# ====================================================================
14
15#
16# Fixed length (n=6), unrolled PPC Montgomery Multiplication
17#
18
19# 2021
20#
21# Although this is a generic implementation for unrolling Montgomery
22# Multiplication for arbitrary values of n, this is currently only
23# used for n = 6 to improve the performance of ECC p384.
24#
25# Unrolling allows intermediate results to be stored in registers,
26# rather than on the stack, improving performance by ~7% compared to
27# the existing PPC assembly code.
28#
29# The ISA 3.0 implementation uses combination multiply/add
30# instructions (maddld, maddhdu) to improve performance by an
31# additional ~10% on Power 9.
32#
33# Finally, saving non-volatile registers into volatile vector
34# registers instead of onto the stack saves a little more.
35#
36# On a Power 9 machine we see an overall improvement of ~18%.
37#
38
39use strict;
40use warnings;
41
42my ($flavour, $output, $dir, $xlate);
43
44# $output is the last argument if it looks like a file (it has an extension)
45# $flavour is the first argument if it doesn't look like a file
46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52die "can't locate ppc-xlate.pl";
53
54open STDOUT,"| $^X $xlate $flavour \"$output\""
55    or die "can't call $xlate: $!";
56
57if ($flavour !~ /64/) {
58	die "bad flavour ($flavour) - only ppc64 permitted";
59}
60
61my $SIZE_T= 8;
62
63# Registers are global so the code is remotely readable
64
65# Parameters for Montgomery multiplication
66my $ze	= "r0";
67my $sp	= "r1";
68my $toc	= "r2";
69my $rp	= "r3";
70my $ap	= "r4";
71my $bp	= "r5";
72my $np	= "r6";
73my $n0	= "r7";
74my $num	= "r8";
75
76my $i	= "r9";
77my $c0	= "r10";
78my $bp0	= "r11";
79my $bpi	= "r11";
80my $bpj	= "r11";
81my $tj	= "r12";
82my $apj	= "r12";
83my $npj	= "r12";
84my $lo	= "r14";
85my $c1	= "r14";
86
87# Non-volatile registers used for tp[i]
88#
89# 12 registers are available but the limit on unrolling is 10,
90# since registers from $tp[0] to $tp[$n+1] are used.
91my @tp = ("r20" .. "r31");
92
93# volatile VSRs for saving non-volatile GPRs - faster than stack
94my @vsrs = ("v32" .. "v46");
95
96package Mont;
97
98sub new($$)
99{
100	my ($class, $n) = @_;
101
102	if ($n > 10) {
103		die "Can't unroll for BN length ${n} (maximum 10)"
104	}
105
106	my $self = {
107		code => "",
108		n => $n,
109	};
110	bless $self, $class;
111
112	return $self;
113}
114
115sub add_code($$)
116{
117	my ($self, $c) = @_;
118
119	$self->{code} .= $c;
120}
121
122sub get_code($)
123{
124	my ($self) = @_;
125
126	return $self->{code};
127}
128
129sub get_function_name($)
130{
131	my ($self) = @_;
132
133	return "bn_mul_mont_fixed_n" . $self->{n};
134}
135
136sub get_label($$)
137{
138	my ($self, $l) = @_;
139
140	return "L" . $l . "_" . $self->{n};
141}
142
143sub get_labels($@)
144{
145	my ($self, @labels) = @_;
146
147	my %out = ();
148
149	foreach my $l (@labels) {
150		$out{"$l"} = $self->get_label("$l");
151	}
152
153	return \%out;
154}
155
156sub nl($)
157{
158	my ($self) = @_;
159
160	$self->add_code("\n");
161}
162
163sub copy_result($)
164{
165	my ($self) = @_;
166
167	my ($n) = $self->{n};
168
169	for (my $j = 0; $j < $n; $j++) {
170		$self->add_code(<<___);
171	std		$tp[$j],`$j*$SIZE_T`($rp)
172___
173	}
174
175}
176
177sub mul_mont_fixed($)
178{
179	my ($self) = @_;
180
181	my ($n) = $self->{n};
182	my $fname = $self->get_function_name();
183	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
184
185	$self->add_code(<<___);
186
187.globl	.${fname}
188.align	5
189.${fname}:
190
191___
192
193	$self->save_registers();
194
195	$self->add_code(<<___);
196	li		$ze,0
197	ld		$n0,0($n0)
198
199	ld		$bp0,0($bp)
200
201	ld		$apj,0($ap)
202___
203
204	$self->mul_c_0($tp[0], $apj, $bp0, $c0);
205
206	for (my $j = 1; $j < $n - 1; $j++) {
207		$self->add_code(<<___);
208	ld		$apj,`$j*$SIZE_T`($ap)
209___
210		$self->mul($tp[$j], $apj, $bp0, $c0);
211	}
212
213	$self->add_code(<<___);
214	ld		$apj,`($n-1)*$SIZE_T`($ap)
215___
216
217	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
218
219	$self->add_code(<<___);
220	li		$tp[$n+1],0
221
222___
223
224	$self->add_code(<<___);
225	li		$i,0
226	mtctr		$num
227	b		$label->{"enter"}
228
229.align	4
230$label->{"outer"}:
231	ldx		$bpi,$bp,$i
232
233	ld		$apj,0($ap)
234___
235
236	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
237
238	for (my $j = 1; $j < $n; $j++) {
239		$self->add_code(<<___);
240	ld		$apj,`$j*$SIZE_T`($ap)
241___
242		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
243	}
244
245	$self->add_code(<<___);
246	addc		$tp[$n],$tp[$n],$c0
247	addze		$tp[$n+1],$ze
248___
249
250	$self->add_code(<<___);
251.align	4
252$label->{"enter"}:
253	mulld		$bpi,$tp[0],$n0
254
255	ld		$npj,0($np)
256___
257
258	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
259
260	for (my $j = 1; $j < $n; $j++) {
261		$self->add_code(<<___);
262	ld		$npj,`$j*$SIZE_T`($np)
263___
264		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
265	}
266
267	$self->add_code(<<___);
268	addc		$tp[$n-1],$tp[$n],$c0
269	addze		$tp[$n],$tp[$n+1]
270
271	addi		$i,$i,$SIZE_T
272	bdnz		$label->{"outer"}
273
274	and.		$tp[$n],$tp[$n],$tp[$n]
275	bne		$label->{"sub"}
276
277	cmpld		$tp[$n-1],$npj
278	blt		$label->{"copy"}
279
280$label->{"sub"}:
281___
282
283	#
284	# Reduction
285	#
286
287		$self->add_code(<<___);
288	ld		$bpj,`0*$SIZE_T`($np)
289	subfc		$c1,$bpj,$tp[0]
290	std		$c1,`0*$SIZE_T`($rp)
291
292___
293	for (my $j = 1; $j < $n - 1; $j++) {
294		$self->add_code(<<___);
295	ld		$bpj,`$j*$SIZE_T`($np)
296	subfe		$c1,$bpj,$tp[$j]
297	std		$c1,`$j*$SIZE_T`($rp)
298
299___
300	}
301
302		$self->add_code(<<___);
303	subfe		$c1,$npj,$tp[$n-1]
304	std		$c1,`($n-1)*$SIZE_T`($rp)
305
306___
307
308	$self->add_code(<<___);
309	addme.		$tp[$n],$tp[$n]
310	beq		$label->{"end"}
311
312$label->{"copy"}:
313___
314
315	$self->copy_result();
316
317	$self->add_code(<<___);
318
319$label->{"end"}:
320___
321
322	$self->restore_registers();
323
324	$self->add_code(<<___);
325	li		r3,1
326	blr
327.size .${fname},.-.${fname}
328___
329
330}
331
332package Mont::GPR;
333
334our @ISA = ('Mont');
335
336sub new($$)
337{
338    my ($class, $n) = @_;
339
340    return $class->SUPER::new($n);
341}
342
343sub save_registers($)
344{
345	my ($self) = @_;
346
347	my $n = $self->{n};
348
349	$self->add_code(<<___);
350	std	$lo,-8($sp)
351___
352
353	for (my $j = 0; $j <= $n+1; $j++) {
354		$self->{code}.=<<___;
355	std	$tp[$j],-`($j+2)*8`($sp)
356___
357	}
358
359	$self->add_code(<<___);
360
361___
362}
363
364sub restore_registers($)
365{
366	my ($self) = @_;
367
368	my $n = $self->{n};
369
370	$self->add_code(<<___);
371	ld	$lo,-8($sp)
372___
373
374	for (my $j = 0; $j <= $n+1; $j++) {
375		$self->{code}.=<<___;
376	ld	$tp[$j],-`($j+2)*8`($sp)
377___
378	}
379
380	$self->{code} .=<<___;
381
382___
383}
384
385# Direct translation of C mul()
386sub mul($$$$$)
387{
388	my ($self, $r, $a, $w, $c) = @_;
389
390	$self->add_code(<<___);
391	mulld		$lo,$a,$w
392	addc		$r,$lo,$c
393	mulhdu		$c,$a,$w
394	addze		$c,$c
395
396___
397}
398
399# Like mul() but $c is ignored as an input - an optimisation to save a
400# preliminary instruction that would set input $c to 0
401sub mul_c_0($$$$$)
402{
403	my ($self, $r, $a, $w, $c) = @_;
404
405	$self->add_code(<<___);
406	mulld		$r,$a,$w
407	mulhdu		$c,$a,$w
408
409___
410}
411
412# Like mul() but does not to the final addition of CA into $c - an
413# optimisation to save an instruction
414sub mul_last($$$$$$)
415{
416	my ($self, $r1, $r2, $a, $w, $c) = @_;
417
418	$self->add_code(<<___);
419	mulld		$lo,$a,$w
420	addc		$r1,$lo,$c
421	mulhdu		$c,$a,$w
422
423	addze		$r2,$c
424___
425}
426
427# Like C mul_add() but allow $r_out and $r_in to be different
428sub mul_add($$$$$$)
429{
430	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
431
432	$self->add_code(<<___);
433	mulld		$lo,$a,$w
434	addc		$lo,$lo,$c
435	mulhdu		$c,$a,$w
436	addze		$c,$c
437	addc		$r_out,$r_in,$lo
438	addze		$c,$c
439
440___
441}
442
443# Like mul_add() but $c is ignored as an input - an optimisation to save a
444# preliminary instruction that would set input $c to 0
445sub mul_add_c_0($$$$$$)
446{
447	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
448
449	$self->add_code(<<___);
450	mulld		$lo,$a,$w
451	addc		$r_out,$r_in,$lo
452	mulhdu		$c,$a,$w
453	addze		$c,$c
454
455___
456}
457
458package Mont::GPR_300;
459
460our @ISA = ('Mont::GPR');
461
462sub new($$)
463{
464	my ($class, $n) = @_;
465
466	my $mont = $class->SUPER::new($n);
467
468	return $mont;
469}
470
471sub get_function_name($)
472{
473	my ($self) = @_;
474
475	return "bn_mul_mont_300_fixed_n" . $self->{n};
476}
477
478sub get_label($$)
479{
480	my ($self, $l) = @_;
481
482	return "L" . $l . "_300_" . $self->{n};
483}
484
485# Direct translation of C mul()
486sub mul($$$$$)
487{
488	my ($self, $r, $a, $w, $c, $last) = @_;
489
490	$self->add_code(<<___);
491	maddld		$r,$a,$w,$c
492	maddhdu		$c,$a,$w,$c
493
494___
495}
496
497# Save the last carry as the final entry
498sub mul_last($$$$$)
499{
500	my ($self, $r1, $r2, $a, $w, $c) = @_;
501
502	$self->add_code(<<___);
503	maddld		$r1,$a,$w,$c
504	maddhdu		$r2,$a,$w,$c
505
506___
507}
508
509# Like mul() but $c is ignored as an input - an optimisation to save a
510# preliminary instruction that would set input $c to 0
511sub mul_c_0($$$$$)
512{
513	my ($self, $r, $a, $w, $c) = @_;
514
515	$self->add_code(<<___);
516	mulld          $r,$a,$w
517	mulhdu          $c,$a,$w
518
519___
520}
521
522# Like C mul_add() but allow $r_out and $r_in to be different
523sub mul_add($$$$$$)
524{
525	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
526
527	$self->add_code(<<___);
528	maddld		$lo,$a,$w,$c
529	maddhdu		$c,$a,$w,$c
530	addc		$r_out,$r_in,$lo
531	addze		$c,$c
532
533___
534}
535
536# Like mul_add() but $c is ignored as an input - an optimisation to save a
537# preliminary instruction that would set input $c to 0
538sub mul_add_c_0($$$$$$)
539{
540	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
541
542	$self->add_code(<<___);
543	maddld		$lo,$a,$w,$r_in
544	maddhdu		$c,$a,$w,$r_in
545___
546
547	if ($r_out ne $lo) {
548		$self->add_code(<<___);
549	mr			$r_out,$lo
550___
551	}
552
553	$self->nl();
554}
555
556
557package main;
558
559my $code;
560
561$code.=<<___;
562.machine "any"
563.text
564___
565
566my $mont;
567
568$mont = new Mont::GPR(6);
569$mont->mul_mont_fixed();
570$code .= $mont->get_code();
571
572$mont = new Mont::GPR_300(6);
573$mont->mul_mont_fixed();
574$code .= $mont->get_code();
575
576$code =~ s/\`([^\`]*)\`/eval $1/gem;
577
578$code.=<<___;
579.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
580___
581
582print $code;
583close STDOUT or die "error closing STDOUT: $!";
584