xref: /openssl/crypto/bn/asm/ia64-mont.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# January 2010
18#
19# "Teaser" Montgomery multiplication module for IA-64. There are
20# several possibilities for improvement:
21#
22# - modulo-scheduling outer loop would eliminate quite a number of
23#   stalls after ldf8, xma and getf.sig outside inner loop and
24#   improve shorter key performance;
25# - shorter vector support [with input vectors being fetched only
26#   once] should be added;
27# - 2x unroll with help of n0[1] would make the code scalable on
28#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
29#   acute interest, because upcoming Tukwila's individual cores are
30#   reportedly based on Itanium 2 design;
31# - dedicated squaring procedure(?);
32#
33# January 2010
34#
35# Shorter vector support is implemented by zero-padding ap and np
36# vectors up to 8 elements, or 512 bits. This means that 256-bit
37# inputs will be processed only 2 times faster than 512-bit inputs,
38# not 4 [as one would expect, because algorithm complexity is n^2].
39# The reason for padding is that inputs shorter than 512 bits won't
40# be processed faster anyway, because minimal critical path of the
41# core loop happens to match 512-bit timing. Either way, it resulted
42# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
43# 1024-bit one [in comparison to original version of *this* module].
44#
45# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
46# this module is:
47#                   sign    verify    sign/s verify/s
48# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
49# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
50# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
51# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
52# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
53# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
54# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
55#
56# ... and *without* (but still with ia64.S):
57#
58# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
59# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
60# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
61# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
62# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
63# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
64# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
65#
66# As can be seen, RSA sign performance improves by 130-30% (the gain
67# shrinks as keys get longer), while RSA verify improves by 74-13%.
68# DSA performance improves by 115-30%.
69
70# $output is the last argument if it looks like a file (it has an extension)
71$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
72
# $ADDP is the instruction the generated code uses to form pointers from
# the incoming arguments. On HP-UX it is "addp4" unless one of the
# remaining arguments contains a 64-bit data model flag, +DD64 or -mlp64;
# everywhere else it is plain "add".
# The flags are matched with an alternation. A bracketed character class
# would instead accept any single one of the characters + D | - m l p
# followed by 64, which is broader than the two flags meant here.
73if ($^O eq "hpux") {
74    $ADDP="addp4";
75    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
76} else { $ADDP="add"; }
77
78$code=<<___;
79.explicit
80.text
81
82// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
83//		    const BN_ULONG *bp,const BN_ULONG *np,
84//		    const BN_ULONG *n0p,int num);
//
// Entry point and dispatcher. The vector length is read from r37
// (presumably num, the sixth argument - confirm against the ABI) and
// compared as a 32-bit quantity:
//   num < 2        return with ret0 = 0, no work is done;
//   2 <= num <= 8  branch to bn_mul_mont_8;
//   num > 8        branch to bn_mul_mont_general.
// Both worker procedures set ret0 to 1 before they return.
85.align	64
86.global	bn_mul_mont#
87.proc	bn_mul_mont#
88bn_mul_mont:
89	.prologue
90	.body
// p6/p7 = (num >= 2)/(num < 2); under p6, p8/p9 = (num > 8)/(num <= 8).
// The .unc form clears p8 and p9 when p6 is false.
91{ .mmi;	cmp4.le		p6,p7=2,r37;;
92(p6)	cmp4.lt.unc	p8,p9=8,r37
93	mov		ret0=r0		};;
94{ .bbb;
95(p9)	br.cond.dptk.many	bn_mul_mont_8
96(p8)	br.cond.dpnt.many	bn_mul_mont_general
97(p7)	br.ret.spnt.many	b0	};;
98.endp	bn_mul_mont#
99
// Register aliases used by bn_mul_mont_general.
// prevfs, prevpr, prevlc and prevsp keep the values of ar.pfs, pr,
// ar.lc and sp found on entry so that they can be restored on exit.
100prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
101
// rptr, aptr, bptr and nptr walk the rp, ap, bp and np vectors.
102rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
103tptr=r16;	// &tp[0]
104tp_1=r17;	// &tp[-1]
// num counts the remaining words of bp, len is the vector size in bytes
// (8*num) and lc is the value loaded into ar.lc for the pipelined loops.
105num=r18;	len=r19;	lc=r20;
106topbit=r21;	// carry bit from tmp[num]
107
// n0 is the word loaded through the n0p argument, m0 the per-pass
// multiplier (low word of the running sum times n0) and bi the current
// word of bp.
108n0=f6;
109m0=f7;
110bi=f8;
111
// bn_mul_mont_general: worker reached from the dispatcher when num > 8.
// This part saves ar.pfs, ar.lc, sp and pr in prevfs, prevlc, prevsp
// and prevpr, carves the temporary vector out of the stack
// (sp = (sp - 8*num - 16) & -16) and primes the first pass: it loads
// bp[0], ap[0..3], np[0..1] and n0, starts the ap[0]*bp[0] and
// ap[1]*bp[0] products and derives m0 = lo(ap[0]*bp[0])*n0.
112.align	64
113.local	bn_mul_mont_general#
114.proc	bn_mul_mont_general#
115bn_mul_mont_general:
116	.prologue
117{ .mmi;	.save	ar.pfs,prevfs
118	alloc	prevfs=ar.pfs,6,2,0,8
119	$ADDP	aptr=0,in1
120	.save	ar.lc,prevlc
121	mov	prevlc=ar.lc		}
122{ .mmi;	.vframe	prevsp
123	mov	prevsp=sp
124	$ADDP	bptr=0,in2
125	.save	pr,prevpr
126	mov	prevpr=pr		};;
127
128	.body
129	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
130	.rotr		a[3],n[3],t[2]
131
132{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
133	ldf8		alo[4]=[aptr],16	// ap[0]
134	$ADDP		r30=8,in1	};;
135{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
136	ldf8		alo[2]=[aptr],16	// ap[2]
137	$ADDP		in4=0,in4	};;
138{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
139	ldf8		n0=[in4]		// n0
140	$ADDP		rptr=0,in0		}
// num is zero-extended from the 32-bit argument; r31 starts at 16 and
// becomes 8*num+16, the number of bytes taken from the stack below.
141{ .mmi;	$ADDP		nptr=0,in3
142	mov		r31=16
143	zxt4		num=in5		};;
144{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
145	shladd		len=num,3,r0
146	shladd		r31=num,3,r31	};;
// lc = num-5 is the ar.lc value shared by the first-pass and inner loops.
147{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
148	add		lc=-5,num
149	sub		r31=sp,r31	};;
150{ .mfb;	and		sp=-16,r31		// alloca
151	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
152	nop.b		0		}
153{ .mfb;	nop.m		0
154	xmpy.lu		alo[4]=alo[4],bi
155	brp.loop.imp	.L1st_ctop,.L1st_cend-16
156					};;
157{ .mfi;	nop.m		0
158	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
159	add		tp_1=8,sp	}
160{ .mfi;	nop.m		0
161	xma.lu		alo[3]=alo[3],bi,ahi[2]
162	mov		pr.rot=0x20001f<<16
163			// ------^----- (p40) at first (p23)
164			// ----------^^ p[16:20]=1
165					};;
166{ .mfi;	nop.m		0
167	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
168	mov		ar.lc=lc	}
// nhi[1] is set to zero (converted from f0) as the initial high word of
// the np*m0 chain.
169{ .mfi;	nop.m		0
170	fcvt.fxu.s1	nhi[1]=f0
171	mov		ar.ec=8		};;
172
// First pass (bp[0]): a rotating-register counted loop closed by
// br.ctop, using the ar.lc, ar.ec and pr.rot values set up above. Each
// trip multiplies one more word of ap by bi and one more word of np by
// m0, moves the low halves into the integer registers a[] and n[], adds
// them with the carry kept in the p39-p42 predicates and stores the sum
// through tp_1.
173.align	32
174.L1st_ctop:
175.pred.rel	"mutex",p40,p42
176{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
177	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
178	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
179{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
180	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
181	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
182{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
183	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
184	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
185{ .mfi;	(p23)	st8		[tp_1]=n[2],8
186	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
187	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
188{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
189	(p16)	nop.m		0
190	br.ctop.sptk	.L1st_ctop			};;
191.L1st_cend:
192
// Loop tail: fetch the top words of both products, add them with the
// pending carry, record the carry out in topbit and store the last
// word. aptr and nptr are rewound by len bytes, num is decremented, and
// tptr and tp_1 are reset to sp+16 and sp+8 for the next pass.
193{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
194	getf.sig	n[0]=nhi[4]
195	add		num=-1,num	};;	// num--
196{ .mmi;	.pred.rel	"mutex",p40,p42
197(p40)	add		n[0]=n[0],a[0]
198(p42)	add		n[0]=n[0],a[0],1
199	sub		aptr=aptr,len	};;	// rewind
200{ .mmi;	.pred.rel	"mutex",p40,p42
201(p40)	cmp.ltu		p41,p39=n[0],a[0]
202(p42)	cmp.leu		p41,p39=n[0],a[0]
203	sub		nptr=nptr,len	};;
204{ .mmi;	.pred.rel	"mutex",p39,p41
205(p39)	add		topbit=r0,r0
206(p41)	add		topbit=r0,r0,1
207	nop.i		0		}
208{ .mmi;	st8		[tp_1]=n[0]
209	add		tptr=16,sp
210	add		tp_1=8,sp	};;
211
// Outer loop: one trip per remaining word of bp. The setup mirrors the
// first pass but also folds in tp[0], which is loaded into ahi[3] and
// then overwritten with zero in memory. clrrrb.pr resets the predicate
// rotation before pr.rot is reloaded.
212.Louter:
213{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
214	ldf8		ahi[3]=[tptr]		// tp[0]
215	add		r30=8,aptr	};;
216{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
217	ldf8		alo[3]=[r30],16		// ap[1]
218	add		r31=8,nptr	};;
219{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
220	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
221	brp.loop.imp	.Linner_ctop,.Linner_cend-16
222					}
223{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
224	xma.lu		alo[4]=alo[4],bi,ahi[3]
225	clrrrb.pr			};;
226{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
227	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
228	nop.i		0		}
229{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
230	xma.lu		alo[3]=alo[3],bi,ahi[2]
231	mov		pr.rot=0x20101f<<16
232			// ------^----- (p40) at first (p23)
233			// --------^--- (p30) at first (p22)
234			// ----------^^ p[16:20]=1
235					};;
236{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
237	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
238	mov		ar.lc=lc	}
239{ .mfi;
240	fcvt.fxu.s1	nhi[1]=f0
241	mov		ar.ec=8		};;
242
243// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
244// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
245// in latter case accounts for two-tick pipeline stall, which means
246// that its performance would be ~20% lower than optimal one. No
247// attempt was made to address this, because original Itanium is
248// hardly represented out in the wild...
//
// Compared with the first-pass loop, each trip additionally loads one
// word of tp through tptr into t[] and adds it to the ap*bp[i] word
// (carry in p29-p32) before that word is added to the np*m0 word
// (carry in p39-p42) and stored through tp_1.
249.align	32
250.Linner_ctop:
251.pred.rel	"mutex",p40,p42
252.pred.rel	"mutex",p30,p32
253{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
254	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
255	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
256{ .mfi;	(p16)	nop.m		0
257	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
258	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
259{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
260	(p16)	nop.f		0
261	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
262{ .mfi;	(p21)	ld8		t[0]=[tptr],8
263	(p16)	nop.f		0
264	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
265{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
266	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
267	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
268{ .mfi;	(p16)	nop.m		0
269	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
270	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
271{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
272	(p16)	nop.m		0
273	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
274{ .mmb;	(p23)	st8		[tp_1]=n[2],8
275	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
276	br.ctop.sptk	.Linner_ctop			};;
277.Linner_cend:
278
// Tail of the pass: the top word of ap*bp[i] (a[0]) absorbs topbit and
// the p31/p33 carry from the tp additions, and is then added to the top
// word of np*m0 (n[0]) with the p40/p42 carry. A carry out of either
// addition sets topbit for the next pass.
// NOTE(review): topbit is cleared in the same instruction group that
// adds it, so the cmp.ltu/cmp.leu pair that follows compares a[0]
// against zero - confirm this is the intended carry test.
279{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
280	getf.sig	n[0]=nhi[4]
281	nop.i		0		};;
282
283{ .mmi;	.pred.rel	"mutex",p31,p33
284(p31)	add		a[0]=a[0],topbit
285(p33)	add		a[0]=a[0],topbit,1
286	mov		topbit=r0	};;
287{ .mfi; .pred.rel	"mutex",p31,p33
288(p31)	cmp.ltu		p32,p30=a[0],topbit
289(p33)	cmp.leu		p32,p30=a[0],topbit
290					}
291{ .mfi;	.pred.rel	"mutex",p40,p42
292(p40)	add		n[0]=n[0],a[0]
293(p42)	add		n[0]=n[0],a[0],1
294					};;
// NOTE(review): the annotation below names p44,p46 although the two
// compares it precedes are predicated on p40,p42 - confirm.
295{ .mmi;	.pred.rel	"mutex",p44,p46
296(p40)	cmp.ltu		p41,p39=n[0],a[0]
297(p42)	cmp.leu		p41,p39=n[0],a[0]
298(p32)	add		topbit=r0,r0,1	}
299
// Store the top word, rewind aptr/nptr by len bytes, reset tptr and
// tp_1, and loop back unless num was 1 before this decrement.
300{ .mmi;	st8		[tp_1]=n[0],8
301	cmp4.ne		p6,p0=1,num
302	sub		aptr=aptr,len	};;	// rewind
303{ .mmi;	sub		nptr=nptr,len
304(p41)	add		topbit=r0,r0,1
305	add		tptr=16,sp	}
306{ .mmb;	add		tp_1=8,sp
307	add		num=-1,num		// num--
308(p6)	br.cond.sptk.many	.Louter	};;
309
// Final reduction, step 1: rp[] = tp[] - np[], word by word, with the
// borrow carried in rotating predicates (p32-p35). lc held num-5 for
// the multiplication loops (num as passed in); adding 4 makes it num-1
// for this loop and for the copy loop that follows. ar.ec is 3 here.
310{ .mbb;	add		lc=4,lc
311	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
312	clrrrb.pr			};;
313{ .mii;	nop.m		0
314	mov		pr.rot=0x10001<<16
315			// ------^---- (p33) at first (p17)
316	mov		ar.lc=lc	}
317{ .mii;	nop.m		0
318	mov		ar.ec=3
319	nop.i		0		};;
320
// p33 selects the subtraction without borrow, p35 the one with borrow;
// the compares produce the borrow for the following word.
321.Lsub_ctop:
322.pred.rel	"mutex",p33,p35
323{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
324	(p16)	nop.f		0
325	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
326{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
327	(p16)	nop.f		0
328	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
329{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
330	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
331	(p18)	nop.b		0			}
332{ .mib;	(p18)	nop.m		0
333	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
334	br.ctop.sptk	.Lsub_ctop			};;
335.Lsub_cend:
336
// Final reduction, step 2: topbit is left unchanged under p34 and
// reduced by one under p36. p6 is then set when topbit is non-zero; in
// that case the copy loop overwrites rp[] with tp[], otherwise rp[]
// keeps tp[]-np[]. Either way every word of tp[] is zeroed as the loop
// passes over it. rptr and tptr are rewound by len bytes first, and
// aptr/bptr are reused as read pointers into rp[] and tp[].
337{ .mmb;	.pred.rel	"mutex",p34,p36
338(p34)	sub	topbit=topbit,r0	// (p19)
339(p36)	sub	topbit=topbit,r0,1
340	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
341					}
342{ .mmb;	sub	rptr=rptr,len		// rewind
343	sub	tptr=tptr,len
344	clrrrb.pr			};;
345{ .mmi;	mov	aptr=rptr
346	mov	bptr=tptr
347	mov	pr.rot=1<<16		};;
348{ .mii;	cmp.eq	p0,p6=topbit,r0
349	mov	ar.lc=lc
350	mov	ar.ec=2			};;
351
352.Lcopy_ctop:
353{ .mmi;	(p16)	ld8	a[0]=[aptr],8
354	(p16)	ld8	t[0]=[bptr],8
355	(p6)	mov	a[1]=t[1]	};;	// (p17)
356{ .mmb;	(p17)	st8	[rptr]=a[1],8
357	(p17)	st8	[tptr]=r0,8
358	br.ctop.sptk	.Lcopy_ctop	};;
359.Lcopy_cend:
360
// Epilogue: report success in ret0 and restore ar.lc, sp and pr from
// the copies taken in the prologue before returning.
361{ .mmi;	mov		ret0=1			// signal "handled"
362	rum		1<<5			// clear um.mfh
363	mov		ar.lc=prevlc	}
364{ .mib;	.restore	sp
365	mov		sp=prevsp
366	mov		pr=prevpr,0x1ffff
367	br.ret.sptk.many	b0	};;
368.endp	bn_mul_mont_general#
369
// Register aliases used by bn_mul_mont_8.
// a1-a8 and n1-n8 receive words of the ap*b[i] and np*m0 products once
// they are moved out of the floating-point registers; n1-n8 are reused
// after the loop for the words of np and of the difference. t0 is the
// least significant word of the running sum.
370a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
371n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
372t0=r15;
373
// ai0-ai7 and ni0-ni7 hold the zero-padded ap[] and np[] vectors for the
// whole procedure.
374ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
375ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
376
// bn_mul_mont_8: worker reached from the dispatcher when 2 <= num <= 8.
// All operands are kept in registers and zero-padded to 8 words.
// Prologue: besides ar.pfs, ar.lc, sp and pr it spills f16-f23 (used
// below as ni0-ni7) to memory around the stack pointer; the code at
// .Ldone fills them back from the same slots.
377.align	64
378.skip	48		// aligns loop body
379.local	bn_mul_mont_8#
380.proc	bn_mul_mont_8#
381bn_mul_mont_8:
382	.prologue
383{ .mmi;	.save		ar.pfs,prevfs
384	alloc		prevfs=ar.pfs,6,2,0,8
385	.vframe		prevsp
386	mov		prevsp=sp
387	.save		ar.lc,prevlc
388	mov		prevlc=ar.lc	}
389{ .mmi;	add		r17=-6*16,sp
390	add		sp=-7*16,sp
391	.save		pr,prevpr
392	mov		prevpr=pr	};;
393
// f16 goes to prevsp-7*16 (sp is post-decremented by another 16), the
// remaining registers follow at 16-byte steps through r16 and r17.
394{ .mmi;	.save.gf	0,0x10
395	stf.spill	[sp]=f16,-16
396	.save.gf	0,0x20
397	stf.spill	[r17]=f17,32
398	add		r16=-5*16,prevsp};;
399{ .mmi;	.save.gf	0,0x40
400	stf.spill	[r16]=f18,32
401	.save.gf	0,0x80
402	stf.spill	[r17]=f19,32
403	$ADDP		aptr=0,in1	};;
404{ .mmi;	.save.gf	0,0x100
405	stf.spill	[r16]=f20,32
406	.save.gf	0,0x200
407	stf.spill	[r17]=f21,32
408	$ADDP		r29=8,in1	};;
409{ .mmi;	.save.gf	0,0x400
410	stf.spill	[r16]=f22
411	.save.gf	0,0x800
412	stf.spill	[r17]=f23
413	$ADDP		rptr=0,in0	};;
414
415	.body
416	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
417	.rotr		t[8]
418
419// load input vectors padding them to 8 elements
// Words 0 and 1 are always loaded. For each further index k the pair
// p4/p5, p6/p7, ... p14/p15 tells whether num >= k+1 (load the word) or
// not (substitute zero converted from f0). The odd predicates are used
// again after the loop to stop storing once num words are written.
420{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
421	ldf8		ai1=[r29],16		// ap[1]
422	$ADDP		bptr=0,in2	}
423{ .mmi;	$ADDP		r30=8,in2
424	$ADDP		nptr=0,in3
425	$ADDP		r31=8,in3	};;
426{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
427	ldf8		bj[6]=[r30],16		// bp[1]
428	cmp4.le		p4,p5=3,in5	}
429{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
430	ldf8		ni1=[r31],16		// np[1]
431	cmp4.le		p6,p7=4,in5	};;
432
433{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
434	(p5)fcvt.fxu	ai2=f0
435	cmp4.le		p8,p9=5,in5	}
436{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
437	(p7)fcvt.fxu	ai3=f0
438	cmp4.le		p10,p11=6,in5	}
439{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
440	(p5)fcvt.fxu	bj[5]=f0
441	cmp4.le		p12,p13=7,in5	}
442{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
443	(p7)fcvt.fxu	bj[4]=f0
444	cmp4.le		p14,p15=8,in5	}
// r28 = num-1 is the loop count loaded into ar.lc further down.
445{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
446	(p5)fcvt.fxu	ni2=f0
447	addp4		r28=-1,in5	}
448{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
449	(p7)fcvt.fxu	ni3=f0
450	$ADDP		in4=0,in4	};;
451
// n0 is loaded through the n0p argument; tf[1] and t[0..7] start at zero.
452{ .mfi;	ldf8		n0=[in4]
453	fcvt.fxu	tf[1]=f0
454	nop.i		0		}
455
456{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
457	(p9)fcvt.fxu	ai4=f0
458	mov		t[0]=r0		}
459{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
460	(p11)fcvt.fxu	ai5=f0
461	mov		t[1]=r0		}
462{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
463	(p9)fcvt.fxu	bj[3]=f0
464	mov		t[2]=r0		}
465{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
466	(p11)fcvt.fxu	bj[2]=f0
467	mov		t[3]=r0		}
468{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
469	(p9)fcvt.fxu	ni4=f0
470	mov		t[4]=r0		}
471{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
472	(p11)fcvt.fxu	ni5=f0
473	mov		t[5]=r0		};;
474
475{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
476	(p13)fcvt.fxu	ai6=f0
477	mov		t[6]=r0		}
478{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
479	(p15)fcvt.fxu	ai7=f0
480	mov		t[7]=r0		}
481{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
482	(p13)fcvt.fxu	bj[1]=f0
483	mov		ar.lc=r28	}
484{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
485	(p15)fcvt.fxu	bj[0]=f0
486	mov		ar.ec=1		}
487{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
488	(p13)fcvt.fxu	ni6=f0
489	mov		pr.rot=1<<16	}
490{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
491	(p15)fcvt.fxu	ni7=f0
492	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
493					};;
494
495// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
496// to measure with help of Interval Time Counter indicated that the
497// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
498// addressing the issue is problematic, because I don't have access
499// to platform-specific instruction-level profiler. On Itanium it
500// should run in 56*n ticks, because of higher xma latency...
//
// One trip per word of bp; bj[7] is the current word and ar.lc was
// loaded with num-1. Instructions under (p16) belong to the current
// word: ap[0..7]*b[i], m0, np[0..5]*m0 and the first two accumulations.
// Instructions under (p17), together with the p40-p43 and p48-p51 carry
// chains marked (p17) in the comments, finish the previous word:
// np[6..7]*m0 and the a3..a8 accumulations. That is why one more pass
// follows the loop. The "N:" tags in the comments appear to number the
// instruction groups of one trip.
501.Louter_8_ctop:
502	.pred.rel		"mutex",p40,p42
503	.pred.rel		"mutex",p48,p50
504{ .mfi;	(p16)	nop.m		0			// 0:
505	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
506	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
507{ .mfi;	(p42)	add		a3=a3,n3,1
508	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
509	(p16)	nop.i		0		};;
510{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
511	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
512	(p50)	add		t[6]=t[6],a3,1	};;
513{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
514	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
515	(p40)	cmp.ltu		p43,p41=a3,n3	}
516{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
517	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
518	(p16)	nop.i		0		};;
519{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
520	(p48)	cmp.ltu		p51,p49=t[6],a3
521	(p50)	cmp.leu		p51,p49=t[6],a3	};;
522	.pred.rel		"mutex",p41,p43
523	.pred.rel		"mutex",p49,p51
524{ .mfi;	(p16)	nop.m		0			// 4:
525	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
526	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
527{ .mfi;	(p43)	add		a4=a4,n4,1
528	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
529	(p16)	nop.i		0		};;
530{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
531	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
532	(p51)	add		t[5]=t[5],a4,1	};;
533{ .mfi;	(p16)	nop.m		0			// 6:
534	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
535	(p41)	cmp.ltu		p42,p40=a4,n4	}
536{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
537	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
538	(p16)	nop.i		0		};;
539{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
540	(p49)	cmp.ltu		p50,p48=t[5],a4
541	(p51)	cmp.leu		p50,p48=t[5],a4	};;
542	.pred.rel		"mutex",p40,p42
543	.pred.rel		"mutex",p48,p50
544{ .mfi;	(p16)	nop.m		0			// 8:
545	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
546	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
547{ .mfi;	(p42)	add		a5=a5,n5,1
548	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
549	(p16)	nop.i		0		};;
550{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
551	(p48)	add		t[4]=t[4],a5		//	(p17) t[4]+=a5
552	(p50)	add		t[4]=t[4],a5,1	};;
553{ .mfi;	(p16)	nop.m		0			// 10:
554	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
555	(p40)	cmp.ltu		p43,p41=a5,n5	}
556{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
557	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
558	(p16)	nop.i		0		};;
559{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
560	(p48)	cmp.ltu		p51,p49=t[4],a5
561	(p50)	cmp.leu		p51,p49=t[4],a5	};;
562	.pred.rel		"mutex",p41,p43
563	.pred.rel		"mutex",p49,p51
564{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
565	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
566	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
567{ .mfi;	(p43)	add		a6=a6,n6,1
568	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
569	(p16)	nop.i		0		};;
570{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
571	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
572	(p51)	add		t[3]=t[3],a6,1	};;
573{ .mfi;	(p16)	nop.m		0			// 14:
574	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
575	(p41)	cmp.ltu		p42,p40=a6,n6	}
576{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
577	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
578	(p16)	nop.i		0		};;
579{ .mii;	(p16)	nop.m		0			// 15:
580	(p49)	cmp.ltu		p50,p48=t[3],a6
581	(p51)	cmp.leu		p50,p48=t[3],a6	};;
582	.pred.rel		"mutex",p40,p42
583	.pred.rel		"mutex",p48,p50
584{ .mfi;	(p16)	nop.m		0			// 16:
585	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
586	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
587{ .mfi;	(p42)	add		a7=a7,n7,1
588	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
589	(p16)	nop.i		0		};;
590{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
591	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
592	(p50)	add		t[2]=t[2],a7,1	};;
593{ .mfi;	(p16)	nop.m		0			// 18:
594	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
595	(p40)	cmp.ltu		p43,p41=a7,n7	}
596{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
597	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
598	(p16)	nop.i		0		};;
599{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
600	(p48)	cmp.ltu		p51,p49=t[2],a7
601	(p50)	cmp.leu		p51,p49=t[2],a7	};;
602	.pred.rel		"mutex",p41,p43
603	.pred.rel		"mutex",p49,p51
604{ .mfi;	(p16)	nop.m		0			// 20:
605	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
606	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
607{ .mfi;	(p43)	add		a8=a8,n8,1
608	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
609	(p16)	nop.i		0		};;
610{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
611	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
612	(p51)	add		t[1]=t[1],a8,1	};;
613{ .mfi;	(p16)	nop.m		0			// 22:
614	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
615	(p41)	cmp.ltu		p42,p40=a8,n8	}
616{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
617	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
618	(p16)	nop.i		0		};;
619{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
620	(p49)	cmp.ltu		p50,p48=t[1],a8
621	(p51)	cmp.leu		p50,p48=t[1],a8	};;
622{ .mfi;	(p16)	nop.m		0			// 24:
623	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
624	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
625{ .mfi;	(p16)	nop.m		0
626	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
627	(p17)	mov		t[0]=r0		};;
628{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
629	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
630	(p42)	add		t[0]=t[0],r0,1	};;
631{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
632	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
633	(p50)	add		t[0]=t[0],r0,1	}
634{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
635	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
636	(p16)	nop.i		0		};;
637{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
638	(p16)	cmp.ltu.unc	p50,p48=t0,a1
639	(p16)	nop.i		0		};;
640	.pred.rel		"mutex",p40,p42
641	.pred.rel		"mutex",p48,p50
642{ .mfi;	(p16)	nop.m		0			// 28:
643	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
644	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
645{ .mfi;	(p42)	add		a2=a2,n2,1
646	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
647	(p16)	nop.i		0		};;
648{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
649	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
650	(p50)	add		t[6]=t[6],a2,1	};;
651{ .mfi;	(p16)	nop.m		0			// 30:
652	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
653	(p40)	cmp.ltu		p41,p39=a2,n2	}
654{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
655	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
656	(p16)	nop.i		0		};;
657{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
658	(p16)	nop.f		0
659	(p48)	cmp.ltu		p49,p47=t[6],a2	}
660{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
661	(p16)	nop.f		0
662	br.ctop.sptk.many	.Louter_8_ctop	};;
663.Louter_8_cend:
664
665// above loop has to execute one more time, without (p16), which is
666// replaced with merged move of np[8] to GPR bank
//
// While the (p17) work of the last word drains, n1..n8 are reloaded
// with np[0..7] (ni0..ni7) as soon as their product values have been
// consumed; they are the subtrahend of the subtraction that follows.
// The last bundle also points r16 and r17 at the f16 and f17 spill
// slots for the epilogue.
667	.pred.rel		"mutex",p40,p42
668	.pred.rel		"mutex",p48,p50
669{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
670	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
671	(p42)	add		a3=a3,n3,1	};;
672{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
673	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
674	(p50)	add		t[6]=t[6],a3,1	};;
675{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
676	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
677	(p40)	cmp.ltu		p43,p41=a3,n3	}
678{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
679	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
680	(p0)	nop.i		0		};;
681{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
682	(p48)	cmp.ltu		p51,p49=t[6],a3
683	(p50)	cmp.leu		p51,p49=t[6],a3	};;
684	.pred.rel		"mutex",p41,p43
685	.pred.rel		"mutex",p49,p51
686{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
687	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
688	(p43)	add		a4=a4,n4,1	};;
689{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
690	(p0)	nop.f		0
691	(p51)	add		t[5]=t[5],a4,1	};;
692{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
693	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
694	(p41)	cmp.ltu		p42,p40=a4,n4	}
695{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
696	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
697	(p0)	nop.i		0		};;
698{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
699	(p49)	cmp.ltu		p50,p48=t[5],a4
700	(p51)	cmp.leu		p50,p48=t[5],a4	};;
701	.pred.rel		"mutex",p40,p42
702	.pred.rel		"mutex",p48,p50
703{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
704	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
705	(p42)	add		a5=a5,n5,1	};;
706{ .mii;	(p0)	nop.m		0			// 9:
707	(p48)	add		t[4]=t[4],a5		//	(p17) t[4]+=a5
708	(p50)	add		t[4]=t[4],a5,1	};;
709{ .mii;	(p0)	nop.m		0			// 10:
710	(p40)	cmp.ltu		p43,p41=a5,n5
711	(p42)	cmp.leu		p43,p41=a5,n5	};;
712{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
713	(p48)	cmp.ltu		p51,p49=t[4],a5
714	(p50)	cmp.leu		p51,p49=t[4],a5	};;
715	.pred.rel		"mutex",p41,p43
716	.pred.rel		"mutex",p49,p51
717{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
718	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
719	(p43)	add		a6=a6,n6,1	};;
720{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
721	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
722	(p51)	add		t[3]=t[3],a6,1	};;
723{ .mii;	(p0)	nop.m		0			// 14:
724	(p41)	cmp.ltu		p42,p40=a6,n6
725	(p43)	cmp.leu		p42,p40=a6,n6	};;
726{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
727	(p49)	cmp.ltu		p50,p48=t[3],a6
728	(p51)	cmp.leu		p50,p48=t[3],a6	};;
729	.pred.rel		"mutex",p40,p42
730	.pred.rel		"mutex",p48,p50
731{ .mii;	(p0)	nop.m		0			// 16:
732	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
733	(p42)	add		a7=a7,n7,1	};;
734{ .mii;	(p0)	nop.m		0			// 17:
735	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
736	(p50)	add		t[2]=t[2],a7,1	};;
737{ .mii;	(p0)	nop.m		0			// 18:
738	(p40)	cmp.ltu		p43,p41=a7,n7
739	(p42)	cmp.leu		p43,p41=a7,n7	};;
740{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
741	(p48)	cmp.ltu		p51,p49=t[2],a7
742	(p50)	cmp.leu		p51,p49=t[2],a7	};;
743	.pred.rel		"mutex",p41,p43
744	.pred.rel		"mutex",p49,p51
745{ .mii;	(p0)	nop.m		0			// 20:
746	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
747	(p43)	add		a8=a8,n8,1	};;
748{ .mmi;	(p0)	nop.m		0			// 21:
749	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
750	(p51)	add		t[1]=t[1],a8,1	}
751{ .mmi;	(p17)	mov		t[0]=r0
752	(p41)	cmp.ltu		p42,p40=a8,n8
753	(p43)	cmp.leu		p42,p40=a8,n8	};;
754{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
755	(p49)	cmp.ltu		p50,p48=t[1],a8
756	(p51)	cmp.leu		p50,p48=t[1],a8	}
757{ .mmi;	(p42)	add		t[0]=t[0],r0,1
758	(p0)	add		r16=-7*16,prevsp
759	(p0)	add		r17=-6*16,prevsp	};;
760
761// subtract np[8] from carrybit|tmp[8]
762// carrybit|tmp[8] layout upon exit from above loop is:
763//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
//
// The difference is built in n1..n8, least significant word first. Each
// step subtracts with the borrow of the previous one; the predicate
// pairs p32/p34 and p33/p35 alternate as (no borrow)/(borrow). The last
// step takes the remaining borrow from the carry word t[0] into a8, so
// that afterwards p34 is set when the whole subtraction borrowed and
// p32 when it did not. r18 is pointed at the f18 spill slot on the way.
764{ .mmi;	(p50)add	t[0]=t[0],r0,1
765	add		r18=-5*16,prevsp
766	sub		n1=t0,n1	};;
767{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
768	.pred.rel	"mutex",p32,p34
769	(p32)sub	n2=t[7],n2
770	(p34)sub	n2=t[7],n2,1	};;
771{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
772	(p34)cmp.geu	p35,p33=n2,t[7];;
773	.pred.rel	"mutex",p33,p35
774	(p33)sub	n3=t[6],n3	}
775{ .mmi;	(p35)sub	n3=t[6],n3,1;;
776	(p33)cmp.gtu	p34,p32=n3,t[6]
777	(p35)cmp.geu	p34,p32=n3,t[6]	};;
778	.pred.rel	"mutex",p32,p34
779{ .mii;	(p32)sub	n4=t[5],n4
780	(p34)sub	n4=t[5],n4,1;;
781	(p32)cmp.gtu	p35,p33=n4,t[5]	}
782{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
783	.pred.rel	"mutex",p33,p35
784	(p33)sub	n5=t[4],n5
785	(p35)sub	n5=t[4],n5,1	};;
786{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
787	(p35)cmp.geu	p34,p32=n5,t[4];;
788	.pred.rel	"mutex",p32,p34
789	(p32)sub	n6=t[3],n6	}
790{ .mmi;	(p34)sub	n6=t[3],n6,1;;
791	(p32)cmp.gtu	p35,p33=n6,t[3]
792	(p34)cmp.geu	p35,p33=n6,t[3]	};;
793	.pred.rel	"mutex",p33,p35
794{ .mii;	(p33)sub	n7=t[2],n7
795	(p35)sub	n7=t[2],n7,1;;
796	(p33)cmp.gtu	p34,p32=n7,t[2]	}
797{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
798	.pred.rel	"mutex",p32,p34
799	(p32)sub	n8=t[1],n8
800	(p34)sub	n8=t[1],n8,1	};;
801{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
802	(p34)cmp.geu	p35,p33=n8,t[1];;
803	.pred.rel	"mutex",p33,p35
804	(p33)sub	a8=t[0],r0	}
805{ .mmi;	(p35)sub	a8=t[0],r0,1;;
806	(p33)cmp.gtu	p34,p32=a8,t[0]
807	(p35)cmp.geu	p34,p32=a8,t[0]	};;
808
809// save the result, either tmp[num] or tmp[num]-np[num]
//
// Under p32 (no final borrow) the difference n1..n8 is stored, under
// p34 the unreduced words t0, t[7]..t[1]. Two words are always written;
// after that p5, p7, ... p15 (num < 3, num < 4, ... num < 8, set while
// the inputs were loaded) branch to .Ldone as soon as num words are
// out. r19 is pointed at the f19 spill slot on the way.
810	.pred.rel	"mutex",p32,p34
811{ .mmi;	(p32)st8	[rptr]=n1,8
812	(p34)st8	[rptr]=t0,8
813	add		r19=-4*16,prevsp};;
814{ .mmb;	(p32)st8	[rptr]=n2,8
815	(p34)st8	[rptr]=t[7],8
816	(p5)br.cond.dpnt.few	.Ldone	};;
817{ .mmb;	(p32)st8	[rptr]=n3,8
818	(p34)st8	[rptr]=t[6],8
819	(p7)br.cond.dpnt.few	.Ldone	};;
820{ .mmb;	(p32)st8	[rptr]=n4,8
821	(p34)st8	[rptr]=t[5],8
822	(p9)br.cond.dpnt.few	.Ldone	};;
823{ .mmb;	(p32)st8	[rptr]=n5,8
824	(p34)st8	[rptr]=t[4],8
825	(p11)br.cond.dpnt.few	.Ldone	};;
826{ .mmb;	(p32)st8	[rptr]=n6,8
827	(p34)st8	[rptr]=t[3],8
828	(p13)br.cond.dpnt.few	.Ldone	};;
829{ .mmb;	(p32)st8	[rptr]=n7,8
830	(p34)st8	[rptr]=t[2],8
831	(p15)br.cond.dpnt.few	.Ldone	};;
832{ .mmb;	(p32)st8	[rptr]=n8,8
833	(p34)st8	[rptr]=t[1],8
834	nop.b		0		};;
// Epilogue: fill f16-f23 back from the slots written in the prologue
// (r16-r19 each serve two registers, 64 bytes apart), restore pr, ar.lc
// and sp, and return 1 in ret0.
835.Ldone:						// epilogue
836{ .mmi;	ldf.fill	f16=[r16],64
837	ldf.fill	f17=[r17],64
838	nop.i		0		}
839{ .mmi;	ldf.fill	f18=[r18],64
840	ldf.fill	f19=[r19],64
841	mov		pr=prevpr,0x1ffff	};;
842{ .mmi;	ldf.fill	f20=[r16]
843	ldf.fill	f21=[r17]
844	mov		ar.lc=prevlc	}
845{ .mmi;	ldf.fill	f22=[r18]
846	ldf.fill	f23=[r19]
847	mov		ret0=1		}	// signal "handled"
848{ .mib;	rum		1<<5
849	.restore	sp
850	mov		sp=prevsp
851	br.ret.sptk.many	b0	};;
852.endp	bn_mul_mont_8#
853
// Attribution string, emitted as a NUL-terminated data object under the
// label copyright. This is the last item of the generated assembly.
854.type	copyright#,\@object
855copyright:
856stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
857___
858
# Emit the generated assembly: into $output when a file name was given,
# otherwise to the inherited STDOUT. The three-argument open keeps the
# file name from being parsed as part of the mode, and a failed open now
# aborts the script instead of silently printing to the old STDOUT and
# leaving the requested file missing.
859if ($output) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
860print $code;
861close STDOUT or die "error closing STDOUT: $!";
862