1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the Apache License 2.0 (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is
# for ECDSA sign, while in the "with/without" column the lower one is
# for ECDH key agreement and the higher one for ECDSA sign, the
# relatively fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
45$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
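# For example (an illustrative invocation only; the flavour and the file
# names are not fixed by this script):
#
#	perl ecp_nistz256-x86_64.pl elf ecp_nistz256-x86_64.s
#
# would set $flavour to "elf" and $output to "ecp_nistz256-x86_64.s".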
47
48$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49
50$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53die "can't locate x86_64-xlate.pl";
54
55open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
56    or die "can't call $xlate: $!";
57*STDOUT=*OUT;
58
59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61	$avx = ($1>=2.19) + ($1>=2.22);
62	$addx = ($1>=2.23);
63}
64
65if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
67	$avx = ($1>=2.09) + ($1>=2.10);
68	$addx = ($1>=2.10);
69}
70
71if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
72	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
73	$avx = ($1>=10) + ($1>=11);
74	$addx = ($1>=12);
75}
76
77if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
78	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
79	$avx = ($ver>=3.0) + ($ver>=3.01);
80	$addx = ($ver>=3.03);
81}
82
83$code.=<<___;
84.text
85.extern	OPENSSL_ia32cap_P
86
# The polynomial, i.e. the NIST P-256 prime P, as four 64-bit limbs,
# least significant limb first
88.section .rodata align=4096
89.align 64
90.Lpoly:
91.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
92
93# 2^512 mod P precomputed for NIST P256 polynomial
94.LRR:
95.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
96
97.LOne:
98.long 1,1,1,1,1,1,1,1
99.LTwo:
100.long 2,2,2,2,2,2,2,2
101.LThree:
102.long 3,3,3,3,3,3,3,3
103.LONE_mont:
104.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
105
106# Constants for computations modulo ord(p256)
107.Lord:
108.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
109.LordK:
110.quad 0xccd1c8aaee00bc4f
111.previous
112___
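# A few identities behind the constants above (informative summary only;
# the limb values in the table are authoritative):
#
#	.Lpoly     = P = 2^256 - 2^224 + 2^192 + 2^96 - 1
#	.LRR       = 2^512 mod P    (converts into Montgomery form)
#	.LONE_mont = 2^256 mod P    (1 in Montgomery form)
#	.Lord      = n, the group order
#	.LordK     = -n^-1 mod 2^64 (Montgomery constant for mod-n reduction)
#
# The .LordK property can be spot-checked with a couple of lines of C
# (illustrative, not part of the build):
#
#	#include <stdint.h>
#	#include <assert.h>
#	int main(void) {
#	    uint64_t n0 = 0xf3b9cac2fc632551ULL;    /* .Lord, limb 0 */
#	    uint64_t k  = 0xccd1c8aaee00bc4fULL;    /* .LordK        */
#	    assert(n0 * k == (uint64_t)-1);         /* == -1 mod 2^64 */
#	    return 0;
#	}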
113
114{
115################################################################################
116# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
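# A plain-C model of the pattern used by mul_by_2/mul_by_3/add below:
# widen, then subtract P once and keep the difference only if it did not
# borrow.  Illustrative sketch only (not part of the build; the name
# ref_mul_by_2 is ours); it assumes unsigned __int128 support and inputs
# fully reduced mod P.
#
#	#include <stdint.h>
#
#	static const uint64_t P[4] = {
#	    0xffffffffffffffffULL, 0x00000000ffffffffULL,
#	    0x0000000000000000ULL, 0xffffffff00000001ULL };
#
#	static void ref_mul_by_2(uint64_t res[4], const uint64_t a[4])
#	{
#	    unsigned __int128 c = 0, b = 0;
#	    uint64_t dbl[4], red[4];
#	    int i;
#
#	    for (i = 0; i < 4; i++) {        /* dbl = 2*a, carry in c */
#	        c += (unsigned __int128)a[i] + a[i];
#	        dbl[i] = (uint64_t)c;
#	        c >>= 64;
#	    }
#	    for (i = 0; i < 4; i++) {        /* red = dbl - P, borrow in b */
#	        b = (unsigned __int128)dbl[i] - P[i] - b;
#	        red[i] = (uint64_t)b;
#	        b = (b >> 64) & 1;
#	    }
#	    for (i = 0; i < 4; i++)          /* keep dbl only on net borrow */
#	        res[i] = ((uint64_t)c < (uint64_t)b) ? dbl[i] : red[i];
#	}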
117
118my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
119my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
120my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
121
122$code.=<<___;
123
124.globl	ecp_nistz256_mul_by_2
125.type	ecp_nistz256_mul_by_2,\@function,2
126.align	64
127ecp_nistz256_mul_by_2:
128.cfi_startproc
129	push	%r12
130.cfi_push	%r12
131	push	%r13
132.cfi_push	%r13
133.Lmul_by_2_body:
134
135	mov	8*0($a_ptr), $a0
136	xor	$t4,$t4
137	mov	8*1($a_ptr), $a1
138	add	$a0, $a0		# a0:a3+a0:a3
139	mov	8*2($a_ptr), $a2
140	adc	$a1, $a1
141	mov	8*3($a_ptr), $a3
142	lea	.Lpoly(%rip), $a_ptr
143	 mov	$a0, $t0
144	adc	$a2, $a2
145	adc	$a3, $a3
146	 mov	$a1, $t1
147	adc	\$0, $t4
148
149	sub	8*0($a_ptr), $a0
150	 mov	$a2, $t2
151	sbb	8*1($a_ptr), $a1
152	sbb	8*2($a_ptr), $a2
153	 mov	$a3, $t3
154	sbb	8*3($a_ptr), $a3
155	sbb	\$0, $t4
156
157	cmovc	$t0, $a0
158	cmovc	$t1, $a1
159	mov	$a0, 8*0($r_ptr)
160	cmovc	$t2, $a2
161	mov	$a1, 8*1($r_ptr)
162	cmovc	$t3, $a3
163	mov	$a2, 8*2($r_ptr)
164	mov	$a3, 8*3($r_ptr)
165
166	mov	0(%rsp),%r13
167.cfi_restore	%r13
168	mov	8(%rsp),%r12
169.cfi_restore	%r12
170	lea	16(%rsp),%rsp
171.cfi_adjust_cfa_offset	-16
172.Lmul_by_2_epilogue:
173	ret
174.cfi_endproc
175.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
176
177################################################################################
178# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
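# Halving modulo P, sketched in C terms (illustrative only): since P is
# odd, a + P is even whenever a is odd, so
#
#	t = (a[0] & 1) ? a + P : a;     /* kept with its 257th bit */
#	res = t >> 1;                   /* shift that bit back in  */
#
# which is exact, and is done branch-free below (cmovz plus shifts).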
179.globl	ecp_nistz256_div_by_2
180.type	ecp_nistz256_div_by_2,\@function,2
181.align	32
182ecp_nistz256_div_by_2:
183.cfi_startproc
184	push	%r12
185.cfi_push	%r12
186	push	%r13
187.cfi_push	%r13
188.Ldiv_by_2_body:
189
190	mov	8*0($a_ptr), $a0
191	mov	8*1($a_ptr), $a1
192	mov	8*2($a_ptr), $a2
193	 mov	$a0, $t0
194	mov	8*3($a_ptr), $a3
195	lea	.Lpoly(%rip), $a_ptr
196
197	 mov	$a1, $t1
198	xor	$t4, $t4
199	add	8*0($a_ptr), $a0
200	 mov	$a2, $t2
201	adc	8*1($a_ptr), $a1
202	adc	8*2($a_ptr), $a2
203	 mov	$a3, $t3
204	adc	8*3($a_ptr), $a3
205	adc	\$0, $t4
206	xor	$a_ptr, $a_ptr		# borrow $a_ptr
207	test	\$1, $t0
208
209	cmovz	$t0, $a0
210	cmovz	$t1, $a1
211	cmovz	$t2, $a2
212	cmovz	$t3, $a3
213	cmovz	$a_ptr, $t4
214
215	mov	$a1, $t0		# a0:a3>>1
216	shr	\$1, $a0
217	shl	\$63, $t0
218	mov	$a2, $t1
219	shr	\$1, $a1
220	or	$t0, $a0
221	shl	\$63, $t1
222	mov	$a3, $t2
223	shr	\$1, $a2
224	or	$t1, $a1
225	shl	\$63, $t2
226	shr	\$1, $a3
227	shl	\$63, $t4
228	or	$t2, $a2
229	or	$t4, $a3
230
231	mov	$a0, 8*0($r_ptr)
232	mov	$a1, 8*1($r_ptr)
233	mov	$a2, 8*2($r_ptr)
234	mov	$a3, 8*3($r_ptr)
235
236	mov	0(%rsp),%r13
237.cfi_restore	%r13
238	mov	8(%rsp),%r12
239.cfi_restore	%r12
240	lea	16(%rsp),%rsp
241.cfi_adjust_cfa_offset	-16
242.Ldiv_by_2_epilogue:
243	ret
244.cfi_endproc
245.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
246
247################################################################################
248# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
249.globl	ecp_nistz256_mul_by_3
250.type	ecp_nistz256_mul_by_3,\@function,2
251.align	32
252ecp_nistz256_mul_by_3:
253.cfi_startproc
254	push	%r12
255.cfi_push	%r12
256	push	%r13
257.cfi_push	%r13
258.Lmul_by_3_body:
259
260	mov	8*0($a_ptr), $a0
261	xor	$t4, $t4
262	mov	8*1($a_ptr), $a1
263	add	$a0, $a0		# a0:a3+a0:a3
264	mov	8*2($a_ptr), $a2
265	adc	$a1, $a1
266	mov	8*3($a_ptr), $a3
267	 mov	$a0, $t0
268	adc	$a2, $a2
269	adc	$a3, $a3
270	 mov	$a1, $t1
271	adc	\$0, $t4
272
273	sub	\$-1, $a0
274	 mov	$a2, $t2
275	sbb	.Lpoly+8*1(%rip), $a1
276	sbb	\$0, $a2
277	 mov	$a3, $t3
278	sbb	.Lpoly+8*3(%rip), $a3
279	sbb	\$0, $t4
280
281	cmovc	$t0, $a0
282	cmovc	$t1, $a1
283	cmovc	$t2, $a2
284	cmovc	$t3, $a3
285
286	xor	$t4, $t4
287	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
288	adc	8*1($a_ptr), $a1
289	 mov	$a0, $t0
290	adc	8*2($a_ptr), $a2
291	adc	8*3($a_ptr), $a3
292	 mov	$a1, $t1
293	adc	\$0, $t4
294
295	sub	\$-1, $a0
296	 mov	$a2, $t2
297	sbb	.Lpoly+8*1(%rip), $a1
298	sbb	\$0, $a2
299	 mov	$a3, $t3
300	sbb	.Lpoly+8*3(%rip), $a3
301	sbb	\$0, $t4
302
303	cmovc	$t0, $a0
304	cmovc	$t1, $a1
305	mov	$a0, 8*0($r_ptr)
306	cmovc	$t2, $a2
307	mov	$a1, 8*1($r_ptr)
308	cmovc	$t3, $a3
309	mov	$a2, 8*2($r_ptr)
310	mov	$a3, 8*3($r_ptr)
311
312	mov	0(%rsp),%r13
313.cfi_restore	%r13
314	mov	8(%rsp),%r12
315.cfi_restore	%r12
316	lea	16(%rsp),%rsp
317.cfi_adjust_cfa_offset	-16
318.Lmul_by_3_epilogue:
319	ret
320.cfi_endproc
321.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
322
323################################################################################
324# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
325.globl	ecp_nistz256_add
326.type	ecp_nistz256_add,\@function,3
327.align	32
328ecp_nistz256_add:
329.cfi_startproc
330	push	%r12
331.cfi_push	%r12
332	push	%r13
333.cfi_push	%r13
334.Ladd_body:
335
336	mov	8*0($a_ptr), $a0
337	xor	$t4, $t4
338	mov	8*1($a_ptr), $a1
339	mov	8*2($a_ptr), $a2
340	mov	8*3($a_ptr), $a3
341	lea	.Lpoly(%rip), $a_ptr
342
343	add	8*0($b_ptr), $a0
344	adc	8*1($b_ptr), $a1
345	 mov	$a0, $t0
346	adc	8*2($b_ptr), $a2
347	adc	8*3($b_ptr), $a3
348	 mov	$a1, $t1
349	adc	\$0, $t4
350
351	sub	8*0($a_ptr), $a0
352	 mov	$a2, $t2
353	sbb	8*1($a_ptr), $a1
354	sbb	8*2($a_ptr), $a2
355	 mov	$a3, $t3
356	sbb	8*3($a_ptr), $a3
357	sbb	\$0, $t4
358
359	cmovc	$t0, $a0
360	cmovc	$t1, $a1
361	mov	$a0, 8*0($r_ptr)
362	cmovc	$t2, $a2
363	mov	$a1, 8*1($r_ptr)
364	cmovc	$t3, $a3
365	mov	$a2, 8*2($r_ptr)
366	mov	$a3, 8*3($r_ptr)
367
368	mov	0(%rsp),%r13
369.cfi_restore	%r13
370	mov	8(%rsp),%r12
371.cfi_restore	%r12
372	lea	16(%rsp),%rsp
373.cfi_adjust_cfa_offset	-16
374.Ladd_epilogue:
375	ret
376.cfi_endproc
377.size	ecp_nistz256_add,.-ecp_nistz256_add
378
379################################################################################
380# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
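# C-level sketch of the pattern used here and in ecp_nistz256_neg below
# (illustrative only): subtract, then add P back if the subtraction
# borrowed; one correction suffices because both inputs are < P.
#
#	t = a - b;              /* may go negative        */
#	if (t < 0) t += P;      /* detected via the borrow */
#	res = t;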
381.globl	ecp_nistz256_sub
382.type	ecp_nistz256_sub,\@function,3
383.align	32
384ecp_nistz256_sub:
385.cfi_startproc
386	push	%r12
387.cfi_push	%r12
388	push	%r13
389.cfi_push	%r13
390.Lsub_body:
391
392	mov	8*0($a_ptr), $a0
393	xor	$t4, $t4
394	mov	8*1($a_ptr), $a1
395	mov	8*2($a_ptr), $a2
396	mov	8*3($a_ptr), $a3
397	lea	.Lpoly(%rip), $a_ptr
398
399	sub	8*0($b_ptr), $a0
400	sbb	8*1($b_ptr), $a1
401	 mov	$a0, $t0
402	sbb	8*2($b_ptr), $a2
403	sbb	8*3($b_ptr), $a3
404	 mov	$a1, $t1
405	sbb	\$0, $t4
406
407	add	8*0($a_ptr), $a0
408	 mov	$a2, $t2
409	adc	8*1($a_ptr), $a1
410	adc	8*2($a_ptr), $a2
411	 mov	$a3, $t3
412	adc	8*3($a_ptr), $a3
413	test	$t4, $t4
414
415	cmovz	$t0, $a0
416	cmovz	$t1, $a1
417	mov	$a0, 8*0($r_ptr)
418	cmovz	$t2, $a2
419	mov	$a1, 8*1($r_ptr)
420	cmovz	$t3, $a3
421	mov	$a2, 8*2($r_ptr)
422	mov	$a3, 8*3($r_ptr)
423
424	mov	0(%rsp),%r13
425.cfi_restore	%r13
426	mov	8(%rsp),%r12
427.cfi_restore	%r12
428	lea	16(%rsp),%rsp
429.cfi_adjust_cfa_offset	-16
430.Lsub_epilogue:
431	ret
432.cfi_endproc
433.size	ecp_nistz256_sub,.-ecp_nistz256_sub
434
435################################################################################
436# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
437.globl	ecp_nistz256_neg
438.type	ecp_nistz256_neg,\@function,2
439.align	32
440ecp_nistz256_neg:
441.cfi_startproc
442	push	%r12
443.cfi_push	%r12
444	push	%r13
445.cfi_push	%r13
446.Lneg_body:
447
448	xor	$a0, $a0
449	xor	$a1, $a1
450	xor	$a2, $a2
451	xor	$a3, $a3
452	xor	$t4, $t4
453
454	sub	8*0($a_ptr), $a0
455	sbb	8*1($a_ptr), $a1
456	sbb	8*2($a_ptr), $a2
457	 mov	$a0, $t0
458	sbb	8*3($a_ptr), $a3
459	lea	.Lpoly(%rip), $a_ptr
460	 mov	$a1, $t1
461	sbb	\$0, $t4
462
463	add	8*0($a_ptr), $a0
464	 mov	$a2, $t2
465	adc	8*1($a_ptr), $a1
466	adc	8*2($a_ptr), $a2
467	 mov	$a3, $t3
468	adc	8*3($a_ptr), $a3
469	test	$t4, $t4
470
471	cmovz	$t0, $a0
472	cmovz	$t1, $a1
473	mov	$a0, 8*0($r_ptr)
474	cmovz	$t2, $a2
475	mov	$a1, 8*1($r_ptr)
476	cmovz	$t3, $a3
477	mov	$a2, 8*2($r_ptr)
478	mov	$a3, 8*3($r_ptr)
479
480	mov	0(%rsp),%r13
481.cfi_restore	%r13
482	mov	8(%rsp),%r12
483.cfi_restore	%r12
484	lea	16(%rsp),%rsp
485.cfi_adjust_cfa_offset	-16
486.Lneg_epilogue:
487	ret
488.cfi_endproc
489.size	ecp_nistz256_neg,.-ecp_nistz256_neg
490___
491}
492{
493my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
494my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
495my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
496my ($poly1,$poly3)=($acc6,$acc7);
497
498$code.=<<___;
499################################################################################
500# void ecp_nistz256_ord_mul_mont(
501#   uint64_t res[4],
502#   uint64_t a[4],
503#   uint64_t b[4]);
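#
# Montgomery multiplication modulo the group order n = .Lord: the result
# is a*b*2^-256 mod n.  A word-by-word sketch (pseudocode, not part of
# the build; "acc" stands for a 320-bit accumulator):
#
#	for (i = 0; i < 4; i++) {
#	    acc += b[i] * a;             /* four 64x64->128 products  */
#	    m    = (uint64_t)acc * ordK; /* .LordK = -n^-1 mod 2^64   */
#	    acc += m * n;                /* low limb becomes zero ... */
#	    acc >>= 64;                  /* ... and is dropped        */
#	}
#	if (acc >= n) acc -= n;          /* final conditional subtract */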
504
505.globl	ecp_nistz256_ord_mul_mont
506.type	ecp_nistz256_ord_mul_mont,\@function,3
507.align	32
508ecp_nistz256_ord_mul_mont:
509.cfi_startproc
510___
511$code.=<<___	if ($addx);
512	mov	\$0x80100, %ecx
513	and	OPENSSL_ia32cap_P+8(%rip), %ecx
514	cmp	\$0x80100, %ecx
515	je	.Lecp_nistz256_ord_mul_montx
516___
517$code.=<<___;
518	push	%rbp
519.cfi_push	%rbp
520	push	%rbx
521.cfi_push	%rbx
522	push	%r12
523.cfi_push	%r12
524	push	%r13
525.cfi_push	%r13
526	push	%r14
527.cfi_push	%r14
528	push	%r15
529.cfi_push	%r15
530.Lord_mul_body:
531
532	mov	8*0($b_org), %rax
533	mov	$b_org, $b_ptr
534	lea	.Lord(%rip), %r14
535	mov	.LordK(%rip), %r15
536
537	################################# * b[0]
538	mov	%rax, $t0
539	mulq	8*0($a_ptr)
540	mov	%rax, $acc0
541	mov	$t0, %rax
542	mov	%rdx, $acc1
543
544	mulq	8*1($a_ptr)
545	add	%rax, $acc1
546	mov	$t0, %rax
547	adc	\$0, %rdx
548	mov	%rdx, $acc2
549
550	mulq	8*2($a_ptr)
551	add	%rax, $acc2
552	mov	$t0, %rax
553	adc	\$0, %rdx
554
555	 mov	$acc0, $acc5
556	 imulq	%r15,$acc0
557
558	mov	%rdx, $acc3
559	mulq	8*3($a_ptr)
560	add	%rax, $acc3
561	 mov	$acc0, %rax
562	adc	\$0, %rdx
563	mov	%rdx, $acc4
564
565	################################# First reduction step
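	# Because ord[2] = 2^64-1 and ord[3] = 2^64-2^32, the products
	# m*ord[2] and m*ord[3] are not computed with mulq at all: they
	# are folded in as shifts and subtractions of m (the shl/shr by
	# 32 and the sub/sbb instructions below), leaving only two real
	# multiplications, by ord[0] and ord[1], per reduction step.
	# The three reduction steps that follow use the same shortcut.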
566	mulq	8*0(%r14)
567	mov	$acc0, $t1
568	add	%rax, $acc5		# guaranteed to be zero
569	mov	$acc0, %rax
570	adc	\$0, %rdx
571	mov	%rdx, $t0
572
573	sub	$acc0, $acc2
574	sbb	\$0, $acc0		# can't borrow
575
576	mulq	8*1(%r14)
577	add	$t0, $acc1
578	adc	\$0, %rdx
579	add	%rax, $acc1
580	mov	$t1, %rax
581	adc	%rdx, $acc2
582	mov	$t1, %rdx
583	adc	\$0, $acc0		# can't overflow
584
585	shl	\$32, %rax
586	shr	\$32, %rdx
587	sub	%rax, $acc3
588	 mov	8*1($b_ptr), %rax
589	sbb	%rdx, $t1		# can't borrow
590
591	add	$acc0, $acc3
592	adc	$t1, $acc4
593	adc	\$0, $acc5
594
595	################################# * b[1]
596	mov	%rax, $t0
597	mulq	8*0($a_ptr)
598	add	%rax, $acc1
599	mov	$t0, %rax
600	adc	\$0, %rdx
601	mov	%rdx, $t1
602
603	mulq	8*1($a_ptr)
604	add	$t1, $acc2
605	adc	\$0, %rdx
606	add	%rax, $acc2
607	mov	$t0, %rax
608	adc	\$0, %rdx
609	mov	%rdx, $t1
610
611	mulq	8*2($a_ptr)
612	add	$t1, $acc3
613	adc	\$0, %rdx
614	add	%rax, $acc3
615	mov	$t0, %rax
616	adc	\$0, %rdx
617
618	 mov	$acc1, $t0
619	 imulq	%r15, $acc1
620
621	mov	%rdx, $t1
622	mulq	8*3($a_ptr)
623	add	$t1, $acc4
624	adc	\$0, %rdx
625	xor	$acc0, $acc0
626	add	%rax, $acc4
627	 mov	$acc1, %rax
628	adc	%rdx, $acc5
629	adc	\$0, $acc0
630
631	################################# Second reduction step
632	mulq	8*0(%r14)
633	mov	$acc1, $t1
634	add	%rax, $t0		# guaranteed to be zero
635	mov	$acc1, %rax
636	adc	%rdx, $t0
637
638	sub	$acc1, $acc3
639	sbb	\$0, $acc1		# can't borrow
640
641	mulq	8*1(%r14)
642	add	$t0, $acc2
643	adc	\$0, %rdx
644	add	%rax, $acc2
645	mov	$t1, %rax
646	adc	%rdx, $acc3
647	mov	$t1, %rdx
648	adc	\$0, $acc1		# can't overflow
649
650	shl	\$32, %rax
651	shr	\$32, %rdx
652	sub	%rax, $acc4
653	 mov	8*2($b_ptr), %rax
654	sbb	%rdx, $t1		# can't borrow
655
656	add	$acc1, $acc4
657	adc	$t1, $acc5
658	adc	\$0, $acc0
659
660	################################## * b[2]
661	mov	%rax, $t0
662	mulq	8*0($a_ptr)
663	add	%rax, $acc2
664	mov	$t0, %rax
665	adc	\$0, %rdx
666	mov	%rdx, $t1
667
668	mulq	8*1($a_ptr)
669	add	$t1, $acc3
670	adc	\$0, %rdx
671	add	%rax, $acc3
672	mov	$t0, %rax
673	adc	\$0, %rdx
674	mov	%rdx, $t1
675
676	mulq	8*2($a_ptr)
677	add	$t1, $acc4
678	adc	\$0, %rdx
679	add	%rax, $acc4
680	mov	$t0, %rax
681	adc	\$0, %rdx
682
683	 mov	$acc2, $t0
684	 imulq	%r15, $acc2
685
686	mov	%rdx, $t1
687	mulq	8*3($a_ptr)
688	add	$t1, $acc5
689	adc	\$0, %rdx
690	xor	$acc1, $acc1
691	add	%rax, $acc5
692	 mov	$acc2, %rax
693	adc	%rdx, $acc0
694	adc	\$0, $acc1
695
696	################################# Third reduction step
697	mulq	8*0(%r14)
698	mov	$acc2, $t1
699	add	%rax, $t0		# guaranteed to be zero
700	mov	$acc2, %rax
701	adc	%rdx, $t0
702
703	sub	$acc2, $acc4
704	sbb	\$0, $acc2		# can't borrow
705
706	mulq	8*1(%r14)
707	add	$t0, $acc3
708	adc	\$0, %rdx
709	add	%rax, $acc3
710	mov	$t1, %rax
711	adc	%rdx, $acc4
712	mov	$t1, %rdx
713	adc	\$0, $acc2		# can't overflow
714
715	shl	\$32, %rax
716	shr	\$32, %rdx
717	sub	%rax, $acc5
718	 mov	8*3($b_ptr), %rax
719	sbb	%rdx, $t1		# can't borrow
720
721	add	$acc2, $acc5
722	adc	$t1, $acc0
723	adc	\$0, $acc1
724
725	################################# * b[3]
726	mov	%rax, $t0
727	mulq	8*0($a_ptr)
728	add	%rax, $acc3
729	mov	$t0, %rax
730	adc	\$0, %rdx
731	mov	%rdx, $t1
732
733	mulq	8*1($a_ptr)
734	add	$t1, $acc4
735	adc	\$0, %rdx
736	add	%rax, $acc4
737	mov	$t0, %rax
738	adc	\$0, %rdx
739	mov	%rdx, $t1
740
741	mulq	8*2($a_ptr)
742	add	$t1, $acc5
743	adc	\$0, %rdx
744	add	%rax, $acc5
745	mov	$t0, %rax
746	adc	\$0, %rdx
747
748	 mov	$acc3, $t0
749	 imulq	%r15, $acc3
750
751	mov	%rdx, $t1
752	mulq	8*3($a_ptr)
753	add	$t1, $acc0
754	adc	\$0, %rdx
755	xor	$acc2, $acc2
756	add	%rax, $acc0
757	 mov	$acc3, %rax
758	adc	%rdx, $acc1
759	adc	\$0, $acc2
760
761	################################# Last reduction step
762	mulq	8*0(%r14)
763	mov	$acc3, $t1
764	add	%rax, $t0		# guaranteed to be zero
765	mov	$acc3, %rax
766	adc	%rdx, $t0
767
768	sub	$acc3, $acc5
769	sbb	\$0, $acc3		# can't borrow
770
771	mulq	8*1(%r14)
772	add	$t0, $acc4
773	adc	\$0, %rdx
774	add	%rax, $acc4
775	mov	$t1, %rax
776	adc	%rdx, $acc5
777	mov	$t1, %rdx
778	adc	\$0, $acc3		# can't overflow
779
780	shl	\$32, %rax
781	shr	\$32, %rdx
782	sub	%rax, $acc0
783	sbb	%rdx, $t1		# can't borrow
784
785	add	$acc3, $acc0
786	adc	$t1, $acc1
787	adc	\$0, $acc2
788
789	################################# Subtract ord
790	 mov	$acc4, $a_ptr
791	sub	8*0(%r14), $acc4
792	 mov	$acc5, $acc3
793	sbb	8*1(%r14), $acc5
794	 mov	$acc0, $t0
795	sbb	8*2(%r14), $acc0
796	 mov	$acc1, $t1
797	sbb	8*3(%r14), $acc1
798	sbb	\$0, $acc2
799
800	cmovc	$a_ptr, $acc4
801	cmovc	$acc3, $acc5
802	cmovc	$t0, $acc0
803	cmovc	$t1, $acc1
804
805	mov	$acc4, 8*0($r_ptr)
806	mov	$acc5, 8*1($r_ptr)
807	mov	$acc0, 8*2($r_ptr)
808	mov	$acc1, 8*3($r_ptr)
809
810	mov	0(%rsp),%r15
811.cfi_restore	%r15
812	mov	8(%rsp),%r14
813.cfi_restore	%r14
814	mov	16(%rsp),%r13
815.cfi_restore	%r13
816	mov	24(%rsp),%r12
817.cfi_restore	%r12
818	mov	32(%rsp),%rbx
819.cfi_restore	%rbx
820	mov	40(%rsp),%rbp
821.cfi_restore	%rbp
822	lea	48(%rsp),%rsp
823.cfi_adjust_cfa_offset	-48
824.Lord_mul_epilogue:
825	ret
826.cfi_endproc
827.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
828
829################################################################################
830# void ecp_nistz256_ord_sqr_mont(
831#   uint64_t res[4],
832#   uint64_t a[4],
833#   uint64_t rep);
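#
# Performs rep consecutive Montgomery squarings of a modulo the group
# order n.  A sketch (illustrative only):
#
#	for (i = 0; i < rep; i++)
#	    a = a * a * 2^-256 mod n;   /* one Montgomery squaring */
#	res = a;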
834
835.globl	ecp_nistz256_ord_sqr_mont
836.type	ecp_nistz256_ord_sqr_mont,\@function,3
837.align	32
838ecp_nistz256_ord_sqr_mont:
839.cfi_startproc
840___
841$code.=<<___	if ($addx);
842	mov	\$0x80100, %ecx
843	and	OPENSSL_ia32cap_P+8(%rip), %ecx
844	cmp	\$0x80100, %ecx
845	je	.Lecp_nistz256_ord_sqr_montx
846___
847$code.=<<___;
848	push	%rbp
849.cfi_push	%rbp
850	push	%rbx
851.cfi_push	%rbx
852	push	%r12
853.cfi_push	%r12
854	push	%r13
855.cfi_push	%r13
856	push	%r14
857.cfi_push	%r14
858	push	%r15
859.cfi_push	%r15
860.Lord_sqr_body:
861
862	mov	8*0($a_ptr), $acc0
863	mov	8*1($a_ptr), %rax
864	mov	8*2($a_ptr), $acc6
865	mov	8*3($a_ptr), $acc7
866	lea	.Lord(%rip), $a_ptr	# pointer to modulus
867	mov	$b_org, $b_ptr
868	jmp	.Loop_ord_sqr
869
870.align	32
871.Loop_ord_sqr:
872	################################# a[1:] * a[0]
873	mov	%rax, $t1		# put aside a[1]
874	mul	$acc0			# a[1] * a[0]
875	mov	%rax, $acc1
876	movq	$t1, %xmm1		# offload a[1]
877	mov	$acc6, %rax
878	mov	%rdx, $acc2
879
880	mul	$acc0			# a[2] * a[0]
881	add	%rax, $acc2
882	mov	$acc7, %rax
883	movq	$acc6, %xmm2		# offload a[2]
884	adc	\$0, %rdx
885	mov	%rdx, $acc3
886
887	mul	$acc0			# a[3] * a[0]
888	add	%rax, $acc3
889	mov	$acc7, %rax
890	movq	$acc7, %xmm3		# offload a[3]
891	adc	\$0, %rdx
892	mov	%rdx, $acc4
893
894	################################# a[3] * a[2]
895	mul	$acc6			# a[3] * a[2]
896	mov	%rax, $acc5
897	mov	$acc6, %rax
898	mov	%rdx, $acc6
899
900	################################# a[2:] * a[1]
901	mul	$t1			# a[2] * a[1]
902	add	%rax, $acc3
903	mov	$acc7, %rax
904	adc	\$0, %rdx
905	mov	%rdx, $acc7
906
907	mul	$t1			# a[3] * a[1]
908	add	%rax, $acc4
909	adc	\$0, %rdx
910
911	add	$acc7, $acc4
912	adc	%rdx, $acc5
913	adc	\$0, $acc6		# can't overflow
914
915	################################# *2
916	xor	$acc7, $acc7
917	mov	$acc0, %rax
918	add	$acc1, $acc1
919	adc	$acc2, $acc2
920	adc	$acc3, $acc3
921	adc	$acc4, $acc4
922	adc	$acc5, $acc5
923	adc	$acc6, $acc6
924	adc	\$0, $acc7
925
926	################################# Missing products
927	mul	%rax			# a[0] * a[0]
928	mov	%rax, $acc0
929	movq	%xmm1, %rax
930	mov	%rdx, $t1
931
932	mul	%rax			# a[1] * a[1]
933	add	$t1, $acc1
934	adc	%rax, $acc2
935	movq	%xmm2, %rax
936	adc	\$0, %rdx
937	mov	%rdx, $t1
938
939	mul	%rax			# a[2] * a[2]
940	add	$t1, $acc3
941	adc	%rax, $acc4
942	movq	%xmm3, %rax
943	adc	\$0, %rdx
944	mov	%rdx, $t1
945
946	 mov	$acc0, $t0
947	 imulq	8*4($a_ptr), $acc0	# *= .LordK
948
949	mul	%rax			# a[3] * a[3]
950	add	$t1, $acc5
951	adc	%rax, $acc6
952	 mov	8*0($a_ptr), %rax	# modulus[0]
953	adc	%rdx, $acc7		# can't overflow
954
955	################################# First reduction step
956	mul	$acc0
957	mov	$acc0, $t1
958	add	%rax, $t0		# guaranteed to be zero
959	mov	8*1($a_ptr), %rax	# modulus[1]
960	adc	%rdx, $t0
961
962	sub	$acc0, $acc2
963	sbb	\$0, $t1		# can't borrow
964
965	mul	$acc0
966	add	$t0, $acc1
967	adc	\$0, %rdx
968	add	%rax, $acc1
969	mov	$acc0, %rax
970	adc	%rdx, $acc2
971	mov	$acc0, %rdx
972	adc	\$0, $t1		# can't overflow
973
974	 mov	$acc1, $t0
975	 imulq	8*4($a_ptr), $acc1	# *= .LordK
976
977	shl	\$32, %rax
978	shr	\$32, %rdx
979	sub	%rax, $acc3
980	 mov	8*0($a_ptr), %rax
981	sbb	%rdx, $acc0		# can't borrow
982
983	add	$t1, $acc3
984	adc	\$0, $acc0		# can't overflow
985
986	################################# Second reduction step
987	mul	$acc1
988	mov	$acc1, $t1
989	add	%rax, $t0		# guaranteed to be zero
990	mov	8*1($a_ptr), %rax
991	adc	%rdx, $t0
992
993	sub	$acc1, $acc3
994	sbb	\$0, $t1		# can't borrow
995
996	mul	$acc1
997	add	$t0, $acc2
998	adc	\$0, %rdx
999	add	%rax, $acc2
1000	mov	$acc1, %rax
1001	adc	%rdx, $acc3
1002	mov	$acc1, %rdx
1003	adc	\$0, $t1		# can't overflow
1004
1005	 mov	$acc2, $t0
1006	 imulq	8*4($a_ptr), $acc2	# *= .LordK
1007
1008	shl	\$32, %rax
1009	shr	\$32, %rdx
1010	sub	%rax, $acc0
1011	 mov	8*0($a_ptr), %rax
1012	sbb	%rdx, $acc1		# can't borrow
1013
1014	add	$t1, $acc0
1015	adc	\$0, $acc1		# can't overflow
1016
1017	################################# Third reduction step
1018	mul	$acc2
1019	mov	$acc2, $t1
1020	add	%rax, $t0		# guaranteed to be zero
1021	mov	8*1($a_ptr), %rax
1022	adc	%rdx, $t0
1023
1024	sub	$acc2, $acc0
1025	sbb	\$0, $t1		# can't borrow
1026
1027	mul	$acc2
1028	add	$t0, $acc3
1029	adc	\$0, %rdx
1030	add	%rax, $acc3
1031	mov	$acc2, %rax
1032	adc	%rdx, $acc0
1033	mov	$acc2, %rdx
1034	adc	\$0, $t1		# can't overflow
1035
1036	 mov	$acc3, $t0
1037	 imulq	8*4($a_ptr), $acc3	# *= .LordK
1038
1039	shl	\$32, %rax
1040	shr	\$32, %rdx
1041	sub	%rax, $acc1
1042	 mov	8*0($a_ptr), %rax
1043	sbb	%rdx, $acc2		# can't borrow
1044
1045	add	$t1, $acc1
1046	adc	\$0, $acc2		# can't overflow
1047
1048	################################# Last reduction step
1049	mul	$acc3
1050	mov	$acc3, $t1
1051	add	%rax, $t0		# guaranteed to be zero
1052	mov	8*1($a_ptr), %rax
1053	adc	%rdx, $t0
1054
1055	sub	$acc3, $acc1
1056	sbb	\$0, $t1		# can't borrow
1057
1058	mul	$acc3
1059	add	$t0, $acc0
1060	adc	\$0, %rdx
1061	add	%rax, $acc0
1062	mov	$acc3, %rax
1063	adc	%rdx, $acc1
1064	mov	$acc3, %rdx
1065	adc	\$0, $t1		# can't overflow
1066
1067	shl	\$32, %rax
1068	shr	\$32, %rdx
1069	sub	%rax, $acc2
1070	sbb	%rdx, $acc3		# can't borrow
1071
1072	add	$t1, $acc2
1073	adc	\$0, $acc3		# can't overflow
1074
1075	################################# Add bits [511:256] of the sqr result
1076	xor	%rdx, %rdx
1077	add	$acc4, $acc0
1078	adc	$acc5, $acc1
1079	 mov	$acc0, $acc4
1080	adc	$acc6, $acc2
1081	adc	$acc7, $acc3
1082	 mov	$acc1, %rax
1083	adc	\$0, %rdx
1084
1085	################################# Compare to modulus
1086	sub	8*0($a_ptr), $acc0
1087	 mov	$acc2, $acc6
1088	sbb	8*1($a_ptr), $acc1
1089	sbb	8*2($a_ptr), $acc2
1090	 mov	$acc3, $acc7
1091	sbb	8*3($a_ptr), $acc3
1092	sbb	\$0, %rdx
1093
1094	cmovc	$acc4, $acc0
1095	cmovnc	$acc1, %rax
1096	cmovnc	$acc2, $acc6
1097	cmovnc	$acc3, $acc7
1098
1099	dec	$b_ptr
1100	jnz	.Loop_ord_sqr
1101
1102	mov	$acc0, 8*0($r_ptr)
1103	mov	%rax,  8*1($r_ptr)
1104	pxor	%xmm1, %xmm1
1105	mov	$acc6, 8*2($r_ptr)
1106	pxor	%xmm2, %xmm2
1107	mov	$acc7, 8*3($r_ptr)
1108	pxor	%xmm3, %xmm3
1109
1110	mov	0(%rsp),%r15
1111.cfi_restore	%r15
1112	mov	8(%rsp),%r14
1113.cfi_restore	%r14
1114	mov	16(%rsp),%r13
1115.cfi_restore	%r13
1116	mov	24(%rsp),%r12
1117.cfi_restore	%r12
1118	mov	32(%rsp),%rbx
1119.cfi_restore	%rbx
1120	mov	40(%rsp),%rbp
1121.cfi_restore	%rbp
1122	lea	48(%rsp),%rsp
1123.cfi_adjust_cfa_offset	-48
1124.Lord_sqr_epilogue:
1125	ret
1126.cfi_endproc
1127.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1128___
1129
1130$code.=<<___	if ($addx);
1131################################################################################
1132.type	ecp_nistz256_ord_mul_montx,\@function,3
1133.align	32
1134ecp_nistz256_ord_mul_montx:
1135.cfi_startproc
1136.Lecp_nistz256_ord_mul_montx:
1137	push	%rbp
1138.cfi_push	%rbp
1139	push	%rbx
1140.cfi_push	%rbx
1141	push	%r12
1142.cfi_push	%r12
1143	push	%r13
1144.cfi_push	%r13
1145	push	%r14
1146.cfi_push	%r14
1147	push	%r15
1148.cfi_push	%r15
1149.Lord_mulx_body:
1150
1151	mov	$b_org, $b_ptr
1152	mov	8*0($b_org), %rdx
1153	mov	8*0($a_ptr), $acc1
1154	mov	8*1($a_ptr), $acc2
1155	mov	8*2($a_ptr), $acc3
1156	mov	8*3($a_ptr), $acc4
1157	lea	-128($a_ptr), $a_ptr	# control u-op density
1158	lea	.Lord-128(%rip), %r14
1159	mov	.LordK(%rip), %r15
1160
1161	################################# Multiply by b[0]
1162	mulx	$acc1, $acc0, $acc1
1163	mulx	$acc2, $t0, $acc2
1164	mulx	$acc3, $t1, $acc3
1165	add	$t0, $acc1
1166	mulx	$acc4, $t0, $acc4
1167	 mov	$acc0, %rdx
1168	 mulx	%r15, %rdx, %rax
1169	adc	$t1, $acc2
1170	adc	$t0, $acc3
1171	adc	\$0, $acc4
1172
1173	################################# reduction
1174	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
1175	mulx	8*0+128(%r14), $t0, $t1
1176	adcx	$t0, $acc0		# guaranteed to be zero
1177	adox	$t1, $acc1
1178
1179	mulx	8*1+128(%r14), $t0, $t1
1180	adcx	$t0, $acc1
1181	adox	$t1, $acc2
1182
1183	mulx	8*2+128(%r14), $t0, $t1
1184	adcx	$t0, $acc2
1185	adox	$t1, $acc3
1186
1187	mulx	8*3+128(%r14), $t0, $t1
1188	 mov	8*1($b_ptr), %rdx
1189	adcx	$t0, $acc3
1190	adox	$t1, $acc4
1191	adcx	$acc0, $acc4
1192	adox	$acc0, $acc5
1193	adc	\$0, $acc5		# cf=0, of=0
1194
1195	################################# Multiply by b[1]
1196	mulx	8*0+128($a_ptr), $t0, $t1
1197	adcx	$t0, $acc1
1198	adox	$t1, $acc2
1199
1200	mulx	8*1+128($a_ptr), $t0, $t1
1201	adcx	$t0, $acc2
1202	adox	$t1, $acc3
1203
1204	mulx	8*2+128($a_ptr), $t0, $t1
1205	adcx	$t0, $acc3
1206	adox	$t1, $acc4
1207
1208	mulx	8*3+128($a_ptr), $t0, $t1
1209	 mov	$acc1, %rdx
1210	 mulx	%r15, %rdx, %rax
1211	adcx	$t0, $acc4
1212	adox	$t1, $acc5
1213
1214	adcx	$acc0, $acc5
1215	adox	$acc0, $acc0
1216	adc	\$0, $acc0		# cf=0, of=0
1217
1218	################################# reduction
1219	mulx	8*0+128(%r14), $t0, $t1
1220	adcx	$t0, $acc1		# guaranteed to be zero
1221	adox	$t1, $acc2
1222
1223	mulx	8*1+128(%r14), $t0, $t1
1224	adcx	$t0, $acc2
1225	adox	$t1, $acc3
1226
1227	mulx	8*2+128(%r14), $t0, $t1
1228	adcx	$t0, $acc3
1229	adox	$t1, $acc4
1230
1231	mulx	8*3+128(%r14), $t0, $t1
1232	 mov	8*2($b_ptr), %rdx
1233	adcx	$t0, $acc4
1234	adox	$t1, $acc5
1235	adcx	$acc1, $acc5
1236	adox	$acc1, $acc0
1237	adc	\$0, $acc0		# cf=0, of=0
1238
1239	################################# Multiply by b[2]
1240	mulx	8*0+128($a_ptr), $t0, $t1
1241	adcx	$t0, $acc2
1242	adox	$t1, $acc3
1243
1244	mulx	8*1+128($a_ptr), $t0, $t1
1245	adcx	$t0, $acc3
1246	adox	$t1, $acc4
1247
1248	mulx	8*2+128($a_ptr), $t0, $t1
1249	adcx	$t0, $acc4
1250	adox	$t1, $acc5
1251
1252	mulx	8*3+128($a_ptr), $t0, $t1
1253	 mov	$acc2, %rdx
1254	 mulx	%r15, %rdx, %rax
1255	adcx	$t0, $acc5
1256	adox	$t1, $acc0
1257
1258	adcx	$acc1, $acc0
1259	adox	$acc1, $acc1
1260	adc	\$0, $acc1		# cf=0, of=0
1261
1262	################################# reduction
1263	mulx	8*0+128(%r14), $t0, $t1
1264	adcx	$t0, $acc2		# guaranteed to be zero
1265	adox	$t1, $acc3
1266
1267	mulx	8*1+128(%r14), $t0, $t1
1268	adcx	$t0, $acc3
1269	adox	$t1, $acc4
1270
1271	mulx	8*2+128(%r14), $t0, $t1
1272	adcx	$t0, $acc4
1273	adox	$t1, $acc5
1274
1275	mulx	8*3+128(%r14), $t0, $t1
1276	 mov	8*3($b_ptr), %rdx
1277	adcx	$t0, $acc5
1278	adox	$t1, $acc0
1279	adcx	$acc2, $acc0
1280	adox	$acc2, $acc1
1281	adc	\$0, $acc1		# cf=0, of=0
1282
1283	################################# Multiply by b[3]
1284	mulx	8*0+128($a_ptr), $t0, $t1
1285	adcx	$t0, $acc3
1286	adox	$t1, $acc4
1287
1288	mulx	8*1+128($a_ptr), $t0, $t1
1289	adcx	$t0, $acc4
1290	adox	$t1, $acc5
1291
1292	mulx	8*2+128($a_ptr), $t0, $t1
1293	adcx	$t0, $acc5
1294	adox	$t1, $acc0
1295
1296	mulx	8*3+128($a_ptr), $t0, $t1
1297	 mov	$acc3, %rdx
1298	 mulx	%r15, %rdx, %rax
1299	adcx	$t0, $acc0
1300	adox	$t1, $acc1
1301
1302	adcx	$acc2, $acc1
1303	adox	$acc2, $acc2
1304	adc	\$0, $acc2		# cf=0, of=0
1305
1306	################################# reduction
1307	mulx	8*0+128(%r14), $t0, $t1
1308	adcx	$t0, $acc3		# guaranteed to be zero
1309	adox	$t1, $acc4
1310
1311	mulx	8*1+128(%r14), $t0, $t1
1312	adcx	$t0, $acc4
1313	adox	$t1, $acc5
1314
1315	mulx	8*2+128(%r14), $t0, $t1
1316	adcx	$t0, $acc5
1317	adox	$t1, $acc0
1318
1319	mulx	8*3+128(%r14), $t0, $t1
1320	lea	128(%r14),%r14
1321	 mov	$acc4, $t2
1322	adcx	$t0, $acc0
1323	adox	$t1, $acc1
1324	 mov	$acc5, $t3
1325	adcx	$acc3, $acc1
1326	adox	$acc3, $acc2
1327	adc	\$0, $acc2
1328
1329	#################################
	# Branch-less conditional subtraction of the order
1331	 mov	$acc0, $t0
1332	sub	8*0(%r14), $acc4
1333	sbb	8*1(%r14), $acc5
1334	sbb	8*2(%r14), $acc0
1335	 mov	$acc1, $t1
1336	sbb	8*3(%r14), $acc1
1337	sbb	\$0, $acc2
1338
1339	cmovc	$t2, $acc4
1340	cmovc	$t3, $acc5
1341	cmovc	$t0, $acc0
1342	cmovc	$t1, $acc1
1343
1344	mov	$acc4, 8*0($r_ptr)
1345	mov	$acc5, 8*1($r_ptr)
1346	mov	$acc0, 8*2($r_ptr)
1347	mov	$acc1, 8*3($r_ptr)
1348
1349	mov	0(%rsp),%r15
1350.cfi_restore	%r15
1351	mov	8(%rsp),%r14
1352.cfi_restore	%r14
1353	mov	16(%rsp),%r13
1354.cfi_restore	%r13
1355	mov	24(%rsp),%r12
1356.cfi_restore	%r12
1357	mov	32(%rsp),%rbx
1358.cfi_restore	%rbx
1359	mov	40(%rsp),%rbp
1360.cfi_restore	%rbp
1361	lea	48(%rsp),%rsp
1362.cfi_adjust_cfa_offset	-48
1363.Lord_mulx_epilogue:
1364	ret
1365.cfi_endproc
1366.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1367
1368.type	ecp_nistz256_ord_sqr_montx,\@function,3
1369.align	32
1370ecp_nistz256_ord_sqr_montx:
1371.cfi_startproc
1372.Lecp_nistz256_ord_sqr_montx:
1373	push	%rbp
1374.cfi_push	%rbp
1375	push	%rbx
1376.cfi_push	%rbx
1377	push	%r12
1378.cfi_push	%r12
1379	push	%r13
1380.cfi_push	%r13
1381	push	%r14
1382.cfi_push	%r14
1383	push	%r15
1384.cfi_push	%r15
1385.Lord_sqrx_body:
1386
1387	mov	$b_org, $b_ptr
1388	mov	8*0($a_ptr), %rdx
1389	mov	8*1($a_ptr), $acc6
1390	mov	8*2($a_ptr), $acc7
1391	mov	8*3($a_ptr), $acc0
1392	lea	.Lord(%rip), $a_ptr
1393	jmp	.Loop_ord_sqrx
1394
1395.align	32
1396.Loop_ord_sqrx:
1397	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1398	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1399	 mov	%rdx, %rax		# offload a[0]
1400	 movq	$acc6, %xmm1		# offload a[1]
1401	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1402	 mov	$acc6, %rdx
1403	add	$t0, $acc2
1404	 movq	$acc7, %xmm2		# offload a[2]
1405	adc	$t1, $acc3
1406	adc	\$0, $acc4
1407	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1408	#################################
1409	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1410	adcx	$t0, $acc3
1411	adox	$t1, $acc4
1412
1413	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1414	 mov	$acc7, %rdx
1415	adcx	$t0, $acc4
1416	adox	$t1, $acc5
1417	adc	\$0, $acc5
1418	#################################
1419	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1420	mov	%rax, %rdx
1421	 movq	$acc0, %xmm3		# offload a[3]
1422	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1423	 adcx	$acc1, $acc1		# acc1:6<<1
1424	adox	$t0, $acc5
1425	 adcx	$acc2, $acc2
1426	adox	$acc7, $acc6		# of=0
1427
1428	################################# a[i]*a[i]
1429	mulx	%rdx, $acc0, $t1
1430	movq	%xmm1, %rdx
1431	 adcx	$acc3, $acc3
1432	adox	$t1, $acc1
1433	 adcx	$acc4, $acc4
1434	mulx	%rdx, $t0, $t4
1435	movq	%xmm2, %rdx
1436	 adcx	$acc5, $acc5
1437	adox	$t0, $acc2
1438	 adcx	$acc6, $acc6
1439	mulx	%rdx, $t0, $t1
1440	.byte	0x67
1441	movq	%xmm3, %rdx
1442	adox	$t4, $acc3
1443	 adcx	$acc7, $acc7
1444	adox	$t0, $acc4
1445	adox	$t1, $acc5
1446	mulx	%rdx, $t0, $t4
1447	adox	$t0, $acc6
1448	adox	$t4, $acc7
1449
1450	################################# reduction
1451	mov	$acc0, %rdx
1452	mulx	8*4($a_ptr), %rdx, $t0
1453
1454	xor	%rax, %rax		# cf=0, of=0
1455	mulx	8*0($a_ptr), $t0, $t1
1456	adcx	$t0, $acc0		# guaranteed to be zero
1457	adox	$t1, $acc1
1458	mulx	8*1($a_ptr), $t0, $t1
1459	adcx	$t0, $acc1
1460	adox	$t1, $acc2
1461	mulx	8*2($a_ptr), $t0, $t1
1462	adcx	$t0, $acc2
1463	adox	$t1, $acc3
1464	mulx	8*3($a_ptr), $t0, $t1
1465	adcx	$t0, $acc3
1466	adox	$t1, $acc0		# of=0
1467	adcx	%rax, $acc0		# cf=0
1468
1469	#################################
1470	mov	$acc1, %rdx
1471	mulx	8*4($a_ptr), %rdx, $t0
1472
1473	mulx	8*0($a_ptr), $t0, $t1
1474	adox	$t0, $acc1		# guaranteed to be zero
1475	adcx	$t1, $acc2
1476	mulx	8*1($a_ptr), $t0, $t1
1477	adox	$t0, $acc2
1478	adcx	$t1, $acc3
1479	mulx	8*2($a_ptr), $t0, $t1
1480	adox	$t0, $acc3
1481	adcx	$t1, $acc0
1482	mulx	8*3($a_ptr), $t0, $t1
1483	adox	$t0, $acc0
1484	adcx	$t1, $acc1		# cf=0
1485	adox	%rax, $acc1		# of=0
1486
1487	#################################
1488	mov	$acc2, %rdx
1489	mulx	8*4($a_ptr), %rdx, $t0
1490
1491	mulx	8*0($a_ptr), $t0, $t1
1492	adcx	$t0, $acc2		# guaranteed to be zero
1493	adox	$t1, $acc3
1494	mulx	8*1($a_ptr), $t0, $t1
1495	adcx	$t0, $acc3
1496	adox	$t1, $acc0
1497	mulx	8*2($a_ptr), $t0, $t1
1498	adcx	$t0, $acc0
1499	adox	$t1, $acc1
1500	mulx	8*3($a_ptr), $t0, $t1
1501	adcx	$t0, $acc1
1502	adox	$t1, $acc2		# of=0
1503	adcx	%rax, $acc2		# cf=0
1504
1505	#################################
1506	mov	$acc3, %rdx
1507	mulx	8*4($a_ptr), %rdx, $t0
1508
1509	mulx	8*0($a_ptr), $t0, $t1
1510	adox	$t0, $acc3		# guaranteed to be zero
1511	adcx	$t1, $acc0
1512	mulx	8*1($a_ptr), $t0, $t1
1513	adox	$t0, $acc0
1514	adcx	$t1, $acc1
1515	mulx	8*2($a_ptr), $t0, $t1
1516	adox	$t0, $acc1
1517	adcx	$t1, $acc2
1518	mulx	8*3($a_ptr), $t0, $t1
1519	adox	$t0, $acc2
1520	adcx	$t1, $acc3
1521	adox	%rax, $acc3
1522
1523	################################# accumulate upper half
1524	add	$acc0, $acc4		# add	$acc4, $acc0
1525	adc	$acc5, $acc1
1526	 mov	$acc4, %rdx
1527	adc	$acc6, $acc2
1528	adc	$acc7, $acc3
1529	 mov	$acc1, $acc6
1530	adc	\$0, %rax
1531
1532	################################# compare to modulus
1533	sub	8*0($a_ptr), $acc4
1534	 mov	$acc2, $acc7
1535	sbb	8*1($a_ptr), $acc1
1536	sbb	8*2($a_ptr), $acc2
1537	 mov	$acc3, $acc0
1538	sbb	8*3($a_ptr), $acc3
1539	sbb	\$0, %rax
1540
1541	cmovnc	$acc4, %rdx
1542	cmovnc	$acc1, $acc6
1543	cmovnc	$acc2, $acc7
1544	cmovnc	$acc3, $acc0
1545
1546	dec	$b_ptr
1547	jnz	.Loop_ord_sqrx
1548
1549	mov	%rdx, 8*0($r_ptr)
1550	mov	$acc6, 8*1($r_ptr)
1551	pxor	%xmm1, %xmm1
1552	mov	$acc7, 8*2($r_ptr)
1553	pxor	%xmm2, %xmm2
1554	mov	$acc0, 8*3($r_ptr)
1555	pxor	%xmm3, %xmm3
1556
1557	mov	0(%rsp),%r15
1558.cfi_restore	%r15
1559	mov	8(%rsp),%r14
1560.cfi_restore	%r14
1561	mov	16(%rsp),%r13
1562.cfi_restore	%r13
1563	mov	24(%rsp),%r12
1564.cfi_restore	%r12
1565	mov	32(%rsp),%rbx
1566.cfi_restore	%rbx
1567	mov	40(%rsp),%rbp
1568.cfi_restore	%rbp
1569	lea	48(%rsp),%rsp
1570.cfi_adjust_cfa_offset	-48
1571.Lord_sqrx_epilogue:
1572	ret
1573.cfi_endproc
1574.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1575___
1576
1577$code.=<<___;
1578################################################################################
1579# void ecp_nistz256_to_mont(
1580#   uint64_t res[4],
1581#   uint64_t in[4]);
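#
# I.e. res = in * 2^256 mod P, obtained as a Montgomery multiplication
# of in by .LRR = 2^512 mod P (the jump below enters mul_mont with
# b = .LRR).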
1582.globl	ecp_nistz256_to_mont
1583.type	ecp_nistz256_to_mont,\@function,2
1584.align	32
1585ecp_nistz256_to_mont:
1586.cfi_startproc
1587___
1588$code.=<<___	if ($addx);
1589	mov	\$0x80100, %ecx
1590	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1591___
1592$code.=<<___;
1593	lea	.LRR(%rip), $b_org
1594	jmp	.Lmul_mont
1595.cfi_endproc
1596.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1597
1598################################################################################
1599# void ecp_nistz256_mul_mont(
1600#   uint64_t res[4],
1601#   uint64_t a[4],
1602#   uint64_t b[4]);
1603
1604.globl	ecp_nistz256_mul_mont
1605.type	ecp_nistz256_mul_mont,\@function,3
1606.align	32
1607ecp_nistz256_mul_mont:
1608.cfi_startproc
1609___
1610$code.=<<___	if ($addx);
1611	mov	\$0x80100, %ecx
1612	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1613___
1614$code.=<<___;
1615.Lmul_mont:
1616	push	%rbp
1617.cfi_push	%rbp
1618	push	%rbx
1619.cfi_push	%rbx
1620	push	%r12
1621.cfi_push	%r12
1622	push	%r13
1623.cfi_push	%r13
1624	push	%r14
1625.cfi_push	%r14
1626	push	%r15
1627.cfi_push	%r15
1628.Lmul_body:
1629___
1630$code.=<<___	if ($addx);
1631	cmp	\$0x80100, %ecx
1632	je	.Lmul_montx
1633___
1634$code.=<<___;
1635	mov	$b_org, $b_ptr
1636	mov	8*0($b_org), %rax
1637	mov	8*0($a_ptr), $acc1
1638	mov	8*1($a_ptr), $acc2
1639	mov	8*2($a_ptr), $acc3
1640	mov	8*3($a_ptr), $acc4
1641
1642	call	__ecp_nistz256_mul_montq
1643___
1644$code.=<<___	if ($addx);
1645	jmp	.Lmul_mont_done
1646
1647.align	32
1648.Lmul_montx:
1649	mov	$b_org, $b_ptr
1650	mov	8*0($b_org), %rdx
1651	mov	8*0($a_ptr), $acc1
1652	mov	8*1($a_ptr), $acc2
1653	mov	8*2($a_ptr), $acc3
1654	mov	8*3($a_ptr), $acc4
1655	lea	-128($a_ptr), $a_ptr	# control u-op density
1656
1657	call	__ecp_nistz256_mul_montx
1658___
1659$code.=<<___;
1660.Lmul_mont_done:
1661	mov	0(%rsp),%r15
1662.cfi_restore	%r15
1663	mov	8(%rsp),%r14
1664.cfi_restore	%r14
1665	mov	16(%rsp),%r13
1666.cfi_restore	%r13
1667	mov	24(%rsp),%r12
1668.cfi_restore	%r12
1669	mov	32(%rsp),%rbx
1670.cfi_restore	%rbx
1671	mov	40(%rsp),%rbp
1672.cfi_restore	%rbp
1673	lea	48(%rsp),%rsp
1674.cfi_adjust_cfa_offset	-48
1675.Lmul_epilogue:
1676	ret
1677.cfi_endproc
1678.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1679
1680.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1681.align	32
1682__ecp_nistz256_mul_montq:
1683.cfi_startproc
1684	########################################################################
1685	# Multiply a by b[0]
1686	mov	%rax, $t1
1687	mulq	$acc1
1688	mov	.Lpoly+8*1(%rip),$poly1
1689	mov	%rax, $acc0
1690	mov	$t1, %rax
1691	mov	%rdx, $acc1
1692
1693	mulq	$acc2
1694	mov	.Lpoly+8*3(%rip),$poly3
1695	add	%rax, $acc1
1696	mov	$t1, %rax
1697	adc	\$0, %rdx
1698	mov	%rdx, $acc2
1699
1700	mulq	$acc3
1701	add	%rax, $acc2
1702	mov	$t1, %rax
1703	adc	\$0, %rdx
1704	mov	%rdx, $acc3
1705
1706	mulq	$acc4
1707	add	%rax, $acc3
1708	 mov	$acc0, %rax
1709	adc	\$0, %rdx
1710	xor	$acc5, $acc5
1711	mov	%rdx, $acc4
1712
1713	########################################################################
1714	# First reduction step
1715	# Basically now we want to multiply acc[0] by p256,
1716	# and add the result to the acc.
1717	# Due to the special form of p256 we do some optimizations
1718	#
1719	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1720	# then we add acc[0] and get acc[0] x 2^96
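	#
	# In other words (illustrative expansion, using the limb values
	# of .Lpoly): p256[0] + p256[1]*2^64 = (2^64-1) + (2^32-1)*2^64
	# = 2^96 - 1, so acc[0]*(p256[0] + p256[1]*2^64) + acc[0] equals
	# acc[0]*2^96, which is what the shl/shr pair below adds in;
	# p256[2] = 0 contributes nothing, and only p256[3] needs a real
	# multiplication (the mulq by .Lpoly[3]).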
1721
1722	mov	$acc0, $t1
1723	shl	\$32, $acc0
1724	mulq	$poly3
1725	shr	\$32, $t1
1726	add	$acc0, $acc1		# +=acc[0]<<96
1727	adc	$t1, $acc2
1728	adc	%rax, $acc3
1729	 mov	8*1($b_ptr), %rax
1730	adc	%rdx, $acc4
1731	adc	\$0, $acc5
1732	xor	$acc0, $acc0
1733
1734	########################################################################
1735	# Multiply by b[1]
1736	mov	%rax, $t1
1737	mulq	8*0($a_ptr)
1738	add	%rax, $acc1
1739	mov	$t1, %rax
1740	adc	\$0, %rdx
1741	mov	%rdx, $t0
1742
1743	mulq	8*1($a_ptr)
1744	add	$t0, $acc2
1745	adc	\$0, %rdx
1746	add	%rax, $acc2
1747	mov	$t1, %rax
1748	adc	\$0, %rdx
1749	mov	%rdx, $t0
1750
1751	mulq	8*2($a_ptr)
1752	add	$t0, $acc3
1753	adc	\$0, %rdx
1754	add	%rax, $acc3
1755	mov	$t1, %rax
1756	adc	\$0, %rdx
1757	mov	%rdx, $t0
1758
1759	mulq	8*3($a_ptr)
1760	add	$t0, $acc4
1761	adc	\$0, %rdx
1762	add	%rax, $acc4
1763	 mov	$acc1, %rax
1764	adc	%rdx, $acc5
1765	adc	\$0, $acc0
1766
1767	########################################################################
1768	# Second reduction step
1769	mov	$acc1, $t1
1770	shl	\$32, $acc1
1771	mulq	$poly3
1772	shr	\$32, $t1
1773	add	$acc1, $acc2
1774	adc	$t1, $acc3
1775	adc	%rax, $acc4
1776	 mov	8*2($b_ptr), %rax
1777	adc	%rdx, $acc5
1778	adc	\$0, $acc0
1779	xor	$acc1, $acc1
1780
1781	########################################################################
1782	# Multiply by b[2]
1783	mov	%rax, $t1
1784	mulq	8*0($a_ptr)
1785	add	%rax, $acc2
1786	mov	$t1, %rax
1787	adc	\$0, %rdx
1788	mov	%rdx, $t0
1789
1790	mulq	8*1($a_ptr)
1791	add	$t0, $acc3
1792	adc	\$0, %rdx
1793	add	%rax, $acc3
1794	mov	$t1, %rax
1795	adc	\$0, %rdx
1796	mov	%rdx, $t0
1797
1798	mulq	8*2($a_ptr)
1799	add	$t0, $acc4
1800	adc	\$0, %rdx
1801	add	%rax, $acc4
1802	mov	$t1, %rax
1803	adc	\$0, %rdx
1804	mov	%rdx, $t0
1805
1806	mulq	8*3($a_ptr)
1807	add	$t0, $acc5
1808	adc	\$0, %rdx
1809	add	%rax, $acc5
1810	 mov	$acc2, %rax
1811	adc	%rdx, $acc0
1812	adc	\$0, $acc1
1813
1814	########################################################################
1815	# Third reduction step
1816	mov	$acc2, $t1
1817	shl	\$32, $acc2
1818	mulq	$poly3
1819	shr	\$32, $t1
1820	add	$acc2, $acc3
1821	adc	$t1, $acc4
1822	adc	%rax, $acc5
1823	 mov	8*3($b_ptr), %rax
1824	adc	%rdx, $acc0
1825	adc	\$0, $acc1
1826	xor	$acc2, $acc2
1827
1828	########################################################################
1829	# Multiply by b[3]
1830	mov	%rax, $t1
1831	mulq	8*0($a_ptr)
1832	add	%rax, $acc3
1833	mov	$t1, %rax
1834	adc	\$0, %rdx
1835	mov	%rdx, $t0
1836
1837	mulq	8*1($a_ptr)
1838	add	$t0, $acc4
1839	adc	\$0, %rdx
1840	add	%rax, $acc4
1841	mov	$t1, %rax
1842	adc	\$0, %rdx
1843	mov	%rdx, $t0
1844
1845	mulq	8*2($a_ptr)
1846	add	$t0, $acc5
1847	adc	\$0, %rdx
1848	add	%rax, $acc5
1849	mov	$t1, %rax
1850	adc	\$0, %rdx
1851	mov	%rdx, $t0
1852
1853	mulq	8*3($a_ptr)
1854	add	$t0, $acc0
1855	adc	\$0, %rdx
1856	add	%rax, $acc0
1857	 mov	$acc3, %rax
1858	adc	%rdx, $acc1
1859	adc	\$0, $acc2
1860
1861	########################################################################
1862	# Final reduction step
1863	mov	$acc3, $t1
1864	shl	\$32, $acc3
1865	mulq	$poly3
1866	shr	\$32, $t1
1867	add	$acc3, $acc4
1868	adc	$t1, $acc5
1869	 mov	$acc4, $t0
1870	adc	%rax, $acc0
1871	adc	%rdx, $acc1
1872	 mov	$acc5, $t1
1873	adc	\$0, $acc2
1874
1875	########################################################################
1876	# Branch-less conditional subtraction of P
1877	sub	\$-1, $acc4		# .Lpoly[0]
1878	 mov	$acc0, $t2
1879	sbb	$poly1, $acc5		# .Lpoly[1]
1880	sbb	\$0, $acc0		# .Lpoly[2]
1881	 mov	$acc1, $t3
1882	sbb	$poly3, $acc1		# .Lpoly[3]
1883	sbb	\$0, $acc2
1884
1885	cmovc	$t0, $acc4
1886	cmovc	$t1, $acc5
1887	mov	$acc4, 8*0($r_ptr)
1888	cmovc	$t2, $acc0
1889	mov	$acc5, 8*1($r_ptr)
1890	cmovc	$t3, $acc1
1891	mov	$acc0, 8*2($r_ptr)
1892	mov	$acc1, 8*3($r_ptr)
1893
1894	ret
1895.cfi_endproc
1896.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1897
1898################################################################################
1899# void ecp_nistz256_sqr_mont(
1900#   uint64_t res[4],
1901#   uint64_t a[4]);
1902
1903# we optimize the square according to S.Gueron and V.Krasnov,
1904# "Speeding up Big-Number Squaring"
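#
# The idea, as a sketch rather than a literal transcription of the code
# below: compute each off-diagonal product a[i]*a[j], i<j, once, double
# the whole partial result with one add-with-carry chain, then add the
# diagonal squares a[i]*a[i], and finish with the same four reduction
# steps modulo .Lpoly as in the multiplication above.  That is 10
# multiplications instead of 16.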
1905.globl	ecp_nistz256_sqr_mont
1906.type	ecp_nistz256_sqr_mont,\@function,2
1907.align	32
1908ecp_nistz256_sqr_mont:
1909.cfi_startproc
1910___
1911$code.=<<___	if ($addx);
1912	mov	\$0x80100, %ecx
1913	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1914___
1915$code.=<<___;
1916	push	%rbp
1917.cfi_push	%rbp
1918	push	%rbx
1919.cfi_push	%rbx
1920	push	%r12
1921.cfi_push	%r12
1922	push	%r13
1923.cfi_push	%r13
1924	push	%r14
1925.cfi_push	%r14
1926	push	%r15
1927.cfi_push	%r15
1928.Lsqr_body:
1929___
1930$code.=<<___	if ($addx);
1931	cmp	\$0x80100, %ecx
1932	je	.Lsqr_montx
1933___
1934$code.=<<___;
1935	mov	8*0($a_ptr), %rax
1936	mov	8*1($a_ptr), $acc6
1937	mov	8*2($a_ptr), $acc7
1938	mov	8*3($a_ptr), $acc0
1939
1940	call	__ecp_nistz256_sqr_montq
1941___
1942$code.=<<___	if ($addx);
1943	jmp	.Lsqr_mont_done
1944
1945.align	32
1946.Lsqr_montx:
1947	mov	8*0($a_ptr), %rdx
1948	mov	8*1($a_ptr), $acc6
1949	mov	8*2($a_ptr), $acc7
1950	mov	8*3($a_ptr), $acc0
1951	lea	-128($a_ptr), $a_ptr	# control u-op density
1952
1953	call	__ecp_nistz256_sqr_montx
1954___
1955$code.=<<___;
1956.Lsqr_mont_done:
1957	mov	0(%rsp),%r15
1958.cfi_restore	%r15
1959	mov	8(%rsp),%r14
1960.cfi_restore	%r14
1961	mov	16(%rsp),%r13
1962.cfi_restore	%r13
1963	mov	24(%rsp),%r12
1964.cfi_restore	%r12
1965	mov	32(%rsp),%rbx
1966.cfi_restore	%rbx
1967	mov	40(%rsp),%rbp
1968.cfi_restore	%rbp
1969	lea	48(%rsp),%rsp
1970.cfi_adjust_cfa_offset	-48
1971.Lsqr_epilogue:
1972	ret
1973.cfi_endproc
1974.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1975
1976.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1977.align	32
1978__ecp_nistz256_sqr_montq:
1979.cfi_startproc
1980	mov	%rax, $acc5
1981	mulq	$acc6			# a[1]*a[0]
1982	mov	%rax, $acc1
1983	mov	$acc7, %rax
1984	mov	%rdx, $acc2
1985
1986	mulq	$acc5			# a[0]*a[2]
1987	add	%rax, $acc2
1988	mov	$acc0, %rax
1989	adc	\$0, %rdx
1990	mov	%rdx, $acc3
1991
1992	mulq	$acc5			# a[0]*a[3]
1993	add	%rax, $acc3
1994	 mov	$acc7, %rax
1995	adc	\$0, %rdx
1996	mov	%rdx, $acc4
1997
1998	#################################
1999	mulq	$acc6			# a[1]*a[2]
2000	add	%rax, $acc3
2001	mov	$acc0, %rax
2002	adc	\$0, %rdx
2003	mov	%rdx, $t1
2004
2005	mulq	$acc6			# a[1]*a[3]
2006	add	%rax, $acc4
2007	 mov	$acc0, %rax
2008	adc	\$0, %rdx
2009	add	$t1, $acc4
2010	mov	%rdx, $acc5
2011	adc	\$0, $acc5
2012
2013	#################################
2014	mulq	$acc7			# a[2]*a[3]
2015	xor	$acc7, $acc7
2016	add	%rax, $acc5
2017	 mov	8*0($a_ptr), %rax
2018	mov	%rdx, $acc6
2019	adc	\$0, $acc6
2020
2021	add	$acc1, $acc1		# acc1:6<<1
2022	adc	$acc2, $acc2
2023	adc	$acc3, $acc3
2024	adc	$acc4, $acc4
2025	adc	$acc5, $acc5
2026	adc	$acc6, $acc6
2027	adc	\$0, $acc7
2028
2029	mulq	%rax
2030	mov	%rax, $acc0
2031	mov	8*1($a_ptr), %rax
2032	mov	%rdx, $t0
2033
2034	mulq	%rax
2035	add	$t0, $acc1
2036	adc	%rax, $acc2
2037	mov	8*2($a_ptr), %rax
2038	adc	\$0, %rdx
2039	mov	%rdx, $t0
2040
2041	mulq	%rax
2042	add	$t0, $acc3
2043	adc	%rax, $acc4
2044	mov	8*3($a_ptr), %rax
2045	adc	\$0, %rdx
2046	mov	%rdx, $t0
2047
2048	mulq	%rax
2049	add	$t0, $acc5
2050	adc	%rax, $acc6
2051	 mov	$acc0, %rax
2052	adc	%rdx, $acc7
2053
2054	mov	.Lpoly+8*1(%rip), $a_ptr
2055	mov	.Lpoly+8*3(%rip), $t1
2056
2057	##########################################
2058	# Now the reduction
2059	# First iteration
2060	mov	$acc0, $t0
2061	shl	\$32, $acc0
2062	mulq	$t1
2063	shr	\$32, $t0
2064	add	$acc0, $acc1		# +=acc[0]<<96
2065	adc	$t0, $acc2
2066	adc	%rax, $acc3
2067	 mov	$acc1, %rax
2068	adc	\$0, %rdx
2069
2070	##########################################
2071	# Second iteration
2072	mov	$acc1, $t0
2073	shl	\$32, $acc1
2074	mov	%rdx, $acc0
2075	mulq	$t1
2076	shr	\$32, $t0
2077	add	$acc1, $acc2
2078	adc	$t0, $acc3
2079	adc	%rax, $acc0
2080	 mov	$acc2, %rax
2081	adc	\$0, %rdx
2082
2083	##########################################
2084	# Third iteration
2085	mov	$acc2, $t0
2086	shl	\$32, $acc2
2087	mov	%rdx, $acc1
2088	mulq	$t1
2089	shr	\$32, $t0
2090	add	$acc2, $acc3
2091	adc	$t0, $acc0
2092	adc	%rax, $acc1
2093	 mov	$acc3, %rax
2094	adc	\$0, %rdx
2095
2096	###########################################
2097	# Last iteration
2098	mov	$acc3, $t0
2099	shl	\$32, $acc3
2100	mov	%rdx, $acc2
2101	mulq	$t1
2102	shr	\$32, $t0
2103	add	$acc3, $acc0
2104	adc	$t0, $acc1
2105	adc	%rax, $acc2
2106	adc	\$0, %rdx
2107	xor	$acc3, $acc3
2108
2109	############################################
2110	# Add the rest of the acc
2111	add	$acc0, $acc4
2112	adc	$acc1, $acc5
2113	 mov	$acc4, $acc0
2114	adc	$acc2, $acc6
2115	adc	%rdx, $acc7
2116	 mov	$acc5, $acc1
2117	adc	\$0, $acc3
2118
2119	sub	\$-1, $acc4		# .Lpoly[0]
2120	 mov	$acc6, $acc2
2121	sbb	$a_ptr, $acc5		# .Lpoly[1]
2122	sbb	\$0, $acc6		# .Lpoly[2]
2123	 mov	$acc7, $t0
2124	sbb	$t1, $acc7		# .Lpoly[3]
2125	sbb	\$0, $acc3
2126
2127	cmovc	$acc0, $acc4
2128	cmovc	$acc1, $acc5
2129	mov	$acc4, 8*0($r_ptr)
2130	cmovc	$acc2, $acc6
2131	mov	$acc5, 8*1($r_ptr)
2132	cmovc	$t0, $acc7
2133	mov	$acc6, 8*2($r_ptr)
2134	mov	$acc7, 8*3($r_ptr)
2135
2136	ret
2137.cfi_endproc
2138.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2139___
2140
2141if ($addx) {
2142$code.=<<___;
2143.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
2144.align	32
2145__ecp_nistz256_mul_montx:
2146.cfi_startproc
2147	########################################################################
2148	# Multiply by b[0]
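	#
	# Throughout this routine the mulx/adcx/adox code keeps two
	# independent carry chains alive at once: adcx adds through CF
	# only and adox through OF only, while mulx itself leaves both
	# flags untouched.  That is why the comments track "cf=0, of=0"
	# explicitly.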
2149	mulx	$acc1, $acc0, $acc1
2150	mulx	$acc2, $t0, $acc2
2151	mov	\$32, $poly1
2152	xor	$acc5, $acc5		# cf=0
2153	mulx	$acc3, $t1, $acc3
2154	mov	.Lpoly+8*3(%rip), $poly3
2155	adc	$t0, $acc1
2156	mulx	$acc4, $t0, $acc4
2157	 mov	$acc0, %rdx
2158	adc	$t1, $acc2
2159	 shlx	$poly1,$acc0,$t1
2160	adc	$t0, $acc3
2161	 shrx	$poly1,$acc0,$t0
2162	adc	\$0, $acc4
2163
2164	########################################################################
2165	# First reduction step
2166	add	$t1, $acc1
2167	adc	$t0, $acc2
2168
2169	mulx	$poly3, $t0, $t1
2170	 mov	8*1($b_ptr), %rdx
2171	adc	$t0, $acc3
2172	adc	$t1, $acc4
2173	adc	\$0, $acc5
2174	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
2175
2176	########################################################################
2177	# Multiply by b[1]
2178	mulx	8*0+128($a_ptr), $t0, $t1
2179	adcx	$t0, $acc1
2180	adox	$t1, $acc2
2181
2182	mulx	8*1+128($a_ptr), $t0, $t1
2183	adcx	$t0, $acc2
2184	adox	$t1, $acc3
2185
2186	mulx	8*2+128($a_ptr), $t0, $t1
2187	adcx	$t0, $acc3
2188	adox	$t1, $acc4
2189
2190	mulx	8*3+128($a_ptr), $t0, $t1
2191	 mov	$acc1, %rdx
2192	adcx	$t0, $acc4
2193	 shlx	$poly1, $acc1, $t0
2194	adox	$t1, $acc5
2195	 shrx	$poly1, $acc1, $t1
2196
2197	adcx	$acc0, $acc5
2198	adox	$acc0, $acc0
2199	adc	\$0, $acc0
2200
2201	########################################################################
2202	# Second reduction step
2203	add	$t0, $acc2
2204	adc	$t1, $acc3
2205
2206	mulx	$poly3, $t0, $t1
2207	 mov	8*2($b_ptr), %rdx
2208	adc	$t0, $acc4
2209	adc	$t1, $acc5
2210	adc	\$0, $acc0
2211	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
2212
2213	########################################################################
2214	# Multiply by b[2]
2215	mulx	8*0+128($a_ptr), $t0, $t1
2216	adcx	$t0, $acc2
2217	adox	$t1, $acc3
2218
2219	mulx	8*1+128($a_ptr), $t0, $t1
2220	adcx	$t0, $acc3
2221	adox	$t1, $acc4
2222
2223	mulx	8*2+128($a_ptr), $t0, $t1
2224	adcx	$t0, $acc4
2225	adox	$t1, $acc5
2226
2227	mulx	8*3+128($a_ptr), $t0, $t1
2228	 mov	$acc2, %rdx
2229	adcx	$t0, $acc5
2230	 shlx	$poly1, $acc2, $t0
2231	adox	$t1, $acc0
2232	 shrx	$poly1, $acc2, $t1
2233
2234	adcx	$acc1, $acc0
2235	adox	$acc1, $acc1
2236	adc	\$0, $acc1
2237
2238	########################################################################
2239	# Third reduction step
2240	add	$t0, $acc3
2241	adc	$t1, $acc4
2242
2243	mulx	$poly3, $t0, $t1
2244	 mov	8*3($b_ptr), %rdx
2245	adc	$t0, $acc5
2246	adc	$t1, $acc0
2247	adc	\$0, $acc1
2248	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
2249
2250	########################################################################
2251	# Multiply by b[3]
2252	mulx	8*0+128($a_ptr), $t0, $t1
2253	adcx	$t0, $acc3
2254	adox	$t1, $acc4
2255
2256	mulx	8*1+128($a_ptr), $t0, $t1
2257	adcx	$t0, $acc4
2258	adox	$t1, $acc5
2259
2260	mulx	8*2+128($a_ptr), $t0, $t1
2261	adcx	$t0, $acc5
2262	adox	$t1, $acc0
2263
2264	mulx	8*3+128($a_ptr), $t0, $t1
2265	 mov	$acc3, %rdx
2266	adcx	$t0, $acc0
2267	 shlx	$poly1, $acc3, $t0
2268	adox	$t1, $acc1
2269	 shrx	$poly1, $acc3, $t1
2270
2271	adcx	$acc2, $acc1
2272	adox	$acc2, $acc2
2273	adc	\$0, $acc2
2274
2275	########################################################################
2276	# Fourth reduction step
2277	add	$t0, $acc4
2278	adc	$t1, $acc5
2279
2280	mulx	$poly3, $t0, $t1
2281	 mov	$acc4, $t2
2282	mov	.Lpoly+8*1(%rip), $poly1
2283	adc	$t0, $acc0
2284	 mov	$acc5, $t3
2285	adc	$t1, $acc1
2286	adc	\$0, $acc2
2287
2288	########################################################################
2289	# Branch-less conditional subtraction of P
2290	xor	%eax, %eax
2291	 mov	$acc0, $t0
2292	sbb	\$-1, $acc4		# .Lpoly[0]
2293	sbb	$poly1, $acc5		# .Lpoly[1]
2294	sbb	\$0, $acc0		# .Lpoly[2]
2295	 mov	$acc1, $t1
2296	sbb	$poly3, $acc1		# .Lpoly[3]
2297	sbb	\$0, $acc2
2298
2299	cmovc	$t2, $acc4
2300	cmovc	$t3, $acc5
2301	mov	$acc4, 8*0($r_ptr)
2302	cmovc	$t0, $acc0
2303	mov	$acc5, 8*1($r_ptr)
2304	cmovc	$t1, $acc1
2305	mov	$acc0, 8*2($r_ptr)
2306	mov	$acc1, 8*3($r_ptr)
2307
2308	ret
2309.cfi_endproc
2310.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2311
2312.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
2313.align	32
2314__ecp_nistz256_sqr_montx:
2315.cfi_startproc
2316	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
2317	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
2318	xor	%eax, %eax
2319	adc	$t0, $acc2
2320	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
2321	 mov	$acc6, %rdx
2322	adc	$t1, $acc3
2323	adc	\$0, $acc4
2324	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
2325
2326	#################################
2327	mulx	$acc7, $t0, $t1		# a[1]*a[2]
2328	adcx	$t0, $acc3
2329	adox	$t1, $acc4
2330
2331	mulx	$acc0, $t0, $t1		# a[1]*a[3]
2332	 mov	$acc7, %rdx
2333	adcx	$t0, $acc4
2334	adox	$t1, $acc5
2335	adc	\$0, $acc5
2336
2337	#################################
2338	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
2339	 mov	8*0+128($a_ptr), %rdx
2340	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
2341	 adcx	$acc1, $acc1		# acc1:6<<1
2342	adox	$t0, $acc5
2343	 adcx	$acc2, $acc2
2344	adox	$acc7, $acc6		# of=0
2345
2346	mulx	%rdx, $acc0, $t1
2347	mov	8*1+128($a_ptr), %rdx
2348	 adcx	$acc3, $acc3
2349	adox	$t1, $acc1
2350	 adcx	$acc4, $acc4
2351	mulx	%rdx, $t0, $t4
2352	mov	8*2+128($a_ptr), %rdx
2353	 adcx	$acc5, $acc5
2354	adox	$t0, $acc2
2355	 adcx	$acc6, $acc6
2356	.byte	0x67
2357	mulx	%rdx, $t0, $t1
2358	mov	8*3+128($a_ptr), %rdx
2359	adox	$t4, $acc3
2360	 adcx	$acc7, $acc7
2361	adox	$t0, $acc4
2362	 mov	\$32, $a_ptr
2363	adox	$t1, $acc5
2364	.byte	0x67,0x67
2365	mulx	%rdx, $t0, $t4
2366	 mov	.Lpoly+8*3(%rip), %rdx
2367	adox	$t0, $acc6
2368	 shlx	$a_ptr, $acc0, $t0
2369	adox	$t4, $acc7
2370	 shrx	$a_ptr, $acc0, $t4
2371	mov	%rdx,$t1
2372
2373	# reduction step 1
2374	add	$t0, $acc1
2375	adc	$t4, $acc2
2376
2377	mulx	$acc0, $t0, $acc0
2378	adc	$t0, $acc3
2379	 shlx	$a_ptr, $acc1, $t0
2380	adc	\$0, $acc0
2381	 shrx	$a_ptr, $acc1, $t4
2382
2383	# reduction step 2
2384	add	$t0, $acc2
2385	adc	$t4, $acc3
2386
2387	mulx	$acc1, $t0, $acc1
2388	adc	$t0, $acc0
2389	 shlx	$a_ptr, $acc2, $t0
2390	adc	\$0, $acc1
2391	 shrx	$a_ptr, $acc2, $t4
2392
2393	# reduction step 3
2394	add	$t0, $acc3
2395	adc	$t4, $acc0
2396
2397	mulx	$acc2, $t0, $acc2
2398	adc	$t0, $acc1
2399	 shlx	$a_ptr, $acc3, $t0
2400	adc	\$0, $acc2
2401	 shrx	$a_ptr, $acc3, $t4
2402
2403	# reduction step 4
2404	add	$t0, $acc0
2405	adc	$t4, $acc1
2406
2407	mulx	$acc3, $t0, $acc3
2408	adc	$t0, $acc2
2409	adc	\$0, $acc3
2410
2411	xor	$t3, $t3
2412	add	$acc0, $acc4		# accumulate upper half
2413	 mov	.Lpoly+8*1(%rip), $a_ptr
2414	adc	$acc1, $acc5
2415	 mov	$acc4, $acc0
2416	adc	$acc2, $acc6
2417	adc	$acc3, $acc7
2418	 mov	$acc5, $acc1
2419	adc	\$0, $t3
2420
2421	sub	\$-1, $acc4		# .Lpoly[0]
2422	 mov	$acc6, $acc2
2423	sbb	$a_ptr, $acc5		# .Lpoly[1]
2424	sbb	\$0, $acc6		# .Lpoly[2]
2425	 mov	$acc7, $acc3
2426	sbb	$t1, $acc7		# .Lpoly[3]
2427	sbb	\$0, $t3
2428
2429	cmovc	$acc0, $acc4
2430	cmovc	$acc1, $acc5
2431	mov	$acc4, 8*0($r_ptr)
2432	cmovc	$acc2, $acc6
2433	mov	$acc5, 8*1($r_ptr)
2434	cmovc	$acc3, $acc7
2435	mov	$acc6, 8*2($r_ptr)
2436	mov	$acc7, 8*3($r_ptr)
2437
2438	ret
2439.cfi_endproc
2440.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2441___
2442}
2443}
2444{
2445my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2446my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2447my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2448
2449$code.=<<___;
2450################################################################################
2451# void ecp_nistz256_from_mont(
2452#   uint64_t res[4],
2453#   uint64_t in[4]);
2454# This one performs Montgomery multiplication by 1, so we only need the reduction
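# Each of the four iterations below can be read as the following sketch
# (illustration only, with hypothetical helper names; not part of the
# generated code):
#
#	u = a[0];				/* n0 == 1 for this prime */
#	a[1] += u << 32;			/* u * 2^96               */
#	a[2] += u >> 32;
#	a[3] += lo64(u * 0xffffffff00000001);	/* u * .Lpoly[3]          */
#	top   = hi64(u * 0xffffffff00000001) + carry;
#	/* a[0] + u*p is divisible by 2^64, so the value drops one limb */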
2455
2456.globl	ecp_nistz256_from_mont
2457.type	ecp_nistz256_from_mont,\@function,2
2458.align	32
2459ecp_nistz256_from_mont:
2460.cfi_startproc
2461	push	%r12
2462.cfi_push	%r12
2463	push	%r13
2464.cfi_push	%r13
2465.Lfrom_body:
2466
2467	mov	8*0($in_ptr), %rax
2468	mov	.Lpoly+8*3(%rip), $t2
2469	mov	8*1($in_ptr), $acc1
2470	mov	8*2($in_ptr), $acc2
2471	mov	8*3($in_ptr), $acc3
2472	mov	%rax, $acc0
2473	mov	.Lpoly+8*1(%rip), $t1
2474
2475	#########################################
2476	# First iteration
2477	mov	%rax, $t0
2478	shl	\$32, $acc0
2479	mulq	$t2
2480	shr	\$32, $t0
2481	add	$acc0, $acc1
2482	adc	$t0, $acc2
2483	adc	%rax, $acc3
2484	 mov	$acc1, %rax
2485	adc	\$0, %rdx
2486
2487	#########################################
2488	# Second iteration
2489	mov	$acc1, $t0
2490	shl	\$32, $acc1
2491	mov	%rdx, $acc0
2492	mulq	$t2
2493	shr	\$32, $t0
2494	add	$acc1, $acc2
2495	adc	$t0, $acc3
2496	adc	%rax, $acc0
2497	 mov	$acc2, %rax
2498	adc	\$0, %rdx
2499
2500	##########################################
2501	# Third iteration
2502	mov	$acc2, $t0
2503	shl	\$32, $acc2
2504	mov	%rdx, $acc1
2505	mulq	$t2
2506	shr	\$32, $t0
2507	add	$acc2, $acc3
2508	adc	$t0, $acc0
2509	adc	%rax, $acc1
2510	 mov	$acc3, %rax
2511	adc	\$0, %rdx
2512
2513	###########################################
2514	# Last iteration
2515	mov	$acc3, $t0
2516	shl	\$32, $acc3
2517	mov	%rdx, $acc2
2518	mulq	$t2
2519	shr	\$32, $t0
2520	add	$acc3, $acc0
2521	adc	$t0, $acc1
2522	 mov	$acc0, $t0
2523	adc	%rax, $acc2
2524	 mov	$acc1, $in_ptr
2525	adc	\$0, %rdx
2526
2527	###########################################
2528	# Branch-less conditional subtraction
2529	sub	\$-1, $acc0
2530	 mov	$acc2, %rax
2531	sbb	$t1, $acc1
2532	sbb	\$0, $acc2
2533	 mov	%rdx, $acc3
2534	sbb	$t2, %rdx
2535	sbb	$t2, $t2
2536
2537	cmovnz	$t0, $acc0
2538	cmovnz	$in_ptr, $acc1
2539	mov	$acc0, 8*0($r_ptr)
2540	cmovnz	%rax, $acc2
2541	mov	$acc1, 8*1($r_ptr)
2542	cmovz	%rdx, $acc3
2543	mov	$acc2, 8*2($r_ptr)
2544	mov	$acc3, 8*3($r_ptr)
2545
2546	mov	0(%rsp),%r13
2547.cfi_restore	%r13
2548	mov	8(%rsp),%r12
2549.cfi_restore	%r12
2550	lea	16(%rsp),%rsp
2551.cfi_adjust_cfa_offset	-16
2552.Lfrom_epilogue:
2553	ret
2554.cfi_endproc
2555.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2556___
2557}
2558{
2559my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2560my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2561my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2562my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2563
2564$code.=<<___;
2565################################################################################
2566# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
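# The table index is 1-based and each entry is a full Jacobian point,
# i.e. three 256-bit field elements = 96 bytes, so the store offset is
# (index-1)*96, computed below as (index*3-3)*32.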
2567.globl	ecp_nistz256_scatter_w5
2568.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
2569.align	32
2570ecp_nistz256_scatter_w5:
2571.cfi_startproc
2572	lea	-3($index,$index,2), $index
2573	movdqa	0x00($in_t), %xmm0
2574	shl	\$5, $index
2575	movdqa	0x10($in_t), %xmm1
2576	movdqa	0x20($in_t), %xmm2
2577	movdqa	0x30($in_t), %xmm3
2578	movdqa	0x40($in_t), %xmm4
2579	movdqa	0x50($in_t), %xmm5
2580	movdqa	%xmm0, 0x00($val,$index)
2581	movdqa	%xmm1, 0x10($val,$index)
2582	movdqa	%xmm2, 0x20($val,$index)
2583	movdqa	%xmm3, 0x30($val,$index)
2584	movdqa	%xmm4, 0x40($val,$index)
2585	movdqa	%xmm5, 0x50($val,$index)
2586
2587	ret
2588.cfi_endproc
2589.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2590
2591################################################################################
2592# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
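# Constant-time gather: all 16 table entries are read and masked, so the
# memory access pattern does not depend on the (secret) index. Roughly
# (illustration only):
#
#	acc = 0;
#	for (i = 1; i <= 16; i++) {
#		mask = (i == index) ? ~0 : 0;	/* pcmpeqd of a running counter */
#		acc |= table[i-1] & mask;	/* pand + por                   */
#	}
#	*val = acc;				/* zero if index == 0           */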
2593.globl	ecp_nistz256_gather_w5
2594.type	ecp_nistz256_gather_w5,\@abi-omnipotent
2595.align	32
2596ecp_nistz256_gather_w5:
2597.cfi_startproc
2598___
2599$code.=<<___	if ($avx>1);
2600	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2601	test	\$`1<<5`, %eax
2602	jnz	.Lavx2_gather_w5
2603___
2604$code.=<<___	if ($win64);
2605	lea	-0x88(%rsp), %rax
2606.LSEH_begin_ecp_nistz256_gather_w5:
2607	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2608	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2609	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2610	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2611	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2612	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2613	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2614	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2615	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2616	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2617	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2618___
2619$code.=<<___;
2620	movdqa	.LOne(%rip), $ONE
2621	movd	$index, $INDEX
2622
2623	pxor	$Ra, $Ra
2624	pxor	$Rb, $Rb
2625	pxor	$Rc, $Rc
2626	pxor	$Rd, $Rd
2627	pxor	$Re, $Re
2628	pxor	$Rf, $Rf
2629
2630	movdqa	$ONE, $M0
2631	pshufd	\$0, $INDEX, $INDEX
2632
2633	mov	\$16, %rax
2634.Lselect_loop_sse_w5:
2635
2636	movdqa	$M0, $TMP0
2637	paddd	$ONE, $M0
2638	pcmpeqd $INDEX, $TMP0
2639
2640	movdqa	16*0($in_t), $T0a
2641	movdqa	16*1($in_t), $T0b
2642	movdqa	16*2($in_t), $T0c
2643	movdqa	16*3($in_t), $T0d
2644	movdqa	16*4($in_t), $T0e
2645	movdqa	16*5($in_t), $T0f
2646	lea 16*6($in_t), $in_t
2647
2648	pand	$TMP0, $T0a
2649	pand	$TMP0, $T0b
2650	por	$T0a, $Ra
2651	pand	$TMP0, $T0c
2652	por	$T0b, $Rb
2653	pand	$TMP0, $T0d
2654	por	$T0c, $Rc
2655	pand	$TMP0, $T0e
2656	por	$T0d, $Rd
2657	pand	$TMP0, $T0f
2658	por	$T0e, $Re
2659	por	$T0f, $Rf
2660
2661	dec	%rax
2662	jnz	.Lselect_loop_sse_w5
2663
2664	movdqu	$Ra, 16*0($val)
2665	movdqu	$Rb, 16*1($val)
2666	movdqu	$Rc, 16*2($val)
2667	movdqu	$Rd, 16*3($val)
2668	movdqu	$Re, 16*4($val)
2669	movdqu	$Rf, 16*5($val)
2670___
2671$code.=<<___	if ($win64);
2672	movaps	(%rsp), %xmm6
2673	movaps	0x10(%rsp), %xmm7
2674	movaps	0x20(%rsp), %xmm8
2675	movaps	0x30(%rsp), %xmm9
2676	movaps	0x40(%rsp), %xmm10
2677	movaps	0x50(%rsp), %xmm11
2678	movaps	0x60(%rsp), %xmm12
2679	movaps	0x70(%rsp), %xmm13
2680	movaps	0x80(%rsp), %xmm14
2681	movaps	0x90(%rsp), %xmm15
2682	lea	0xa8(%rsp), %rsp
2683___
2684$code.=<<___;
2685	ret
2686.cfi_endproc
2687.LSEH_end_ecp_nistz256_gather_w5:
2688.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2689
2690################################################################################
2691# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2692.globl	ecp_nistz256_scatter_w7
2693.type	ecp_nistz256_scatter_w7,\@abi-omnipotent
2694.align	32
2695ecp_nistz256_scatter_w7:
2696.cfi_startproc
2697	movdqu	0x00($in_t), %xmm0
2698	shl	\$6, $index
2699	movdqu	0x10($in_t), %xmm1
2700	movdqu	0x20($in_t), %xmm2
2701	movdqu	0x30($in_t), %xmm3
2702	movdqa	%xmm0, 0x00($val,$index)
2703	movdqa	%xmm1, 0x10($val,$index)
2704	movdqa	%xmm2, 0x20($val,$index)
2705	movdqa	%xmm3, 0x30($val,$index)
2706
2707	ret
2708.cfi_endproc
2709.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2710
2711################################################################################
2712# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
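# Same masked-select technique as ecp_nistz256_gather_w5 above, but over
# 64 affine entries of 64 bytes (two 256-bit field elements) each.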
2713.globl	ecp_nistz256_gather_w7
2714.type	ecp_nistz256_gather_w7,\@abi-omnipotent
2715.align	32
2716ecp_nistz256_gather_w7:
2717.cfi_startproc
2718___
2719$code.=<<___	if ($avx>1);
2720	mov	OPENSSL_ia32cap_P+8(%rip), %eax
2721	test	\$`1<<5`, %eax
2722	jnz	.Lavx2_gather_w7
2723___
2724$code.=<<___	if ($win64);
2725	lea	-0x88(%rsp), %rax
2726.LSEH_begin_ecp_nistz256_gather_w7:
2727	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2728	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2729	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2730	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2731	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2732	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2733	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2734	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2735	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2736	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2737	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2738___
2739$code.=<<___;
2740	movdqa	.LOne(%rip), $M0
2741	movd	$index, $INDEX
2742
2743	pxor	$Ra, $Ra
2744	pxor	$Rb, $Rb
2745	pxor	$Rc, $Rc
2746	pxor	$Rd, $Rd
2747
2748	movdqa	$M0, $ONE
2749	pshufd	\$0, $INDEX, $INDEX
2750	mov	\$64, %rax
2751
2752.Lselect_loop_sse_w7:
2753	movdqa	$M0, $TMP0
2754	paddd	$ONE, $M0
2755	movdqa	16*0($in_t), $T0a
2756	movdqa	16*1($in_t), $T0b
2757	pcmpeqd	$INDEX, $TMP0
2758	movdqa	16*2($in_t), $T0c
2759	movdqa	16*3($in_t), $T0d
2760	lea	16*4($in_t), $in_t
2761
2762	pand	$TMP0, $T0a
2763	pand	$TMP0, $T0b
2764	por	$T0a, $Ra
2765	pand	$TMP0, $T0c
2766	por	$T0b, $Rb
2767	pand	$TMP0, $T0d
2768	por	$T0c, $Rc
2769	prefetcht0	255($in_t)
2770	por	$T0d, $Rd
2771
2772	dec	%rax
2773	jnz	.Lselect_loop_sse_w7
2774
2775	movdqu	$Ra, 16*0($val)
2776	movdqu	$Rb, 16*1($val)
2777	movdqu	$Rc, 16*2($val)
2778	movdqu	$Rd, 16*3($val)
2779___
2780$code.=<<___	if ($win64);
2781	movaps	(%rsp), %xmm6
2782	movaps	0x10(%rsp), %xmm7
2783	movaps	0x20(%rsp), %xmm8
2784	movaps	0x30(%rsp), %xmm9
2785	movaps	0x40(%rsp), %xmm10
2786	movaps	0x50(%rsp), %xmm11
2787	movaps	0x60(%rsp), %xmm12
2788	movaps	0x70(%rsp), %xmm13
2789	movaps	0x80(%rsp), %xmm14
2790	movaps	0x90(%rsp), %xmm15
2791	lea	0xa8(%rsp), %rsp
2792___
2793$code.=<<___;
2794	ret
2795.cfi_endproc
2796.LSEH_end_ecp_nistz256_gather_w7:
2797.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2798___
2799}
2800if ($avx>1) {
2801my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2802my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2803my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2804my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2805
2806$code.=<<___;
2807################################################################################
2808# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2809.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2810.align	32
2811ecp_nistz256_avx2_gather_w5:
2812.cfi_startproc
2813.Lavx2_gather_w5:
2814	vzeroupper
2815___
2816$code.=<<___	if ($win64);
2817	lea	-0x88(%rsp), %rax
2818	mov	%rsp,%r11
2819.LSEH_begin_ecp_nistz256_avx2_gather_w5:
2820	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2821	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2822	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2823	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2824	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2825	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2826	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2827	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2828	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2829	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2830	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2831___
2832$code.=<<___;
2833	vmovdqa	.LTwo(%rip), $TWO
2834
2835	vpxor	$Ra, $Ra, $Ra
2836	vpxor	$Rb, $Rb, $Rb
2837	vpxor	$Rc, $Rc, $Rc
2838
2839	vmovdqa .LOne(%rip), $M0
2840	vmovdqa .LTwo(%rip), $M1
2841
2842	vmovd	$index, %xmm1
2843	vpermd	$INDEX, $Ra, $INDEX
2844
2845	mov	\$8, %rax
2846.Lselect_loop_avx2_w5:
2847
2848	vmovdqa	32*0($in_t), $T0a
2849	vmovdqa	32*1($in_t), $T0b
2850	vmovdqa	32*2($in_t), $T0c
2851
2852	vmovdqa	32*3($in_t), $T1a
2853	vmovdqa	32*4($in_t), $T1b
2854	vmovdqa	32*5($in_t), $T1c
2855
2856	vpcmpeqd	$INDEX, $M0, $TMP0
2857	vpcmpeqd	$INDEX, $M1, $TMP1
2858
2859	vpaddd	$TWO, $M0, $M0
2860	vpaddd	$TWO, $M1, $M1
2861	lea	32*6($in_t), $in_t
2862
2863	vpand	$TMP0, $T0a, $T0a
2864	vpand	$TMP0, $T0b, $T0b
2865	vpand	$TMP0, $T0c, $T0c
2866	vpand	$TMP1, $T1a, $T1a
2867	vpand	$TMP1, $T1b, $T1b
2868	vpand	$TMP1, $T1c, $T1c
2869
2870	vpxor	$T0a, $Ra, $Ra
2871	vpxor	$T0b, $Rb, $Rb
2872	vpxor	$T0c, $Rc, $Rc
2873	vpxor	$T1a, $Ra, $Ra
2874	vpxor	$T1b, $Rb, $Rb
2875	vpxor	$T1c, $Rc, $Rc
2876
2877	dec %rax
2878	jnz .Lselect_loop_avx2_w5
2879
2880	vmovdqu $Ra, 32*0($val)
2881	vmovdqu $Rb, 32*1($val)
2882	vmovdqu $Rc, 32*2($val)
2883	vzeroupper
2884___
2885$code.=<<___	if ($win64);
2886	movaps	(%rsp), %xmm6
2887	movaps	0x10(%rsp), %xmm7
2888	movaps	0x20(%rsp), %xmm8
2889	movaps	0x30(%rsp), %xmm9
2890	movaps	0x40(%rsp), %xmm10
2891	movaps	0x50(%rsp), %xmm11
2892	movaps	0x60(%rsp), %xmm12
2893	movaps	0x70(%rsp), %xmm13
2894	movaps	0x80(%rsp), %xmm14
2895	movaps	0x90(%rsp), %xmm15
2896	lea	(%r11), %rsp
2897___
2898$code.=<<___;
2899	ret
2900.cfi_endproc
2901.LSEH_end_ecp_nistz256_avx2_gather_w5:
2902.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
2903___
2904}
2905if ($avx>1) {
2906my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2907my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2908my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2909my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2910my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2911
2912$code.=<<___;
2913
2914################################################################################
2915# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
2916.globl	ecp_nistz256_avx2_gather_w7
2917.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2918.align	32
2919ecp_nistz256_avx2_gather_w7:
2920.cfi_startproc
2921.Lavx2_gather_w7:
2922	vzeroupper
2923___
2924$code.=<<___	if ($win64);
2925	mov	%rsp,%r11
2926	lea	-0x88(%rsp), %rax
2927.LSEH_begin_ecp_nistz256_avx2_gather_w7:
2928	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2929	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2930	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2931	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2932	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2933	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2934	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2935	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2936	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2937	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2938	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2939___
2940$code.=<<___;
2941	vmovdqa	.LThree(%rip), $THREE
2942
2943	vpxor	$Ra, $Ra, $Ra
2944	vpxor	$Rb, $Rb, $Rb
2945
2946	vmovdqa .LOne(%rip), $M0
2947	vmovdqa .LTwo(%rip), $M1
2948	vmovdqa .LThree(%rip), $M2
2949
2950	vmovd	$index, %xmm1
2951	vpermd	$INDEX, $Ra, $INDEX
2952	# Skip index = 0, because it is implicitly the point at infinity
2953
2954	mov	\$21, %rax
2955.Lselect_loop_avx2_w7:
2956
2957	vmovdqa	32*0($in_t), $T0a
2958	vmovdqa	32*1($in_t), $T0b
2959
2960	vmovdqa	32*2($in_t), $T1a
2961	vmovdqa	32*3($in_t), $T1b
2962
2963	vmovdqa	32*4($in_t), $T2a
2964	vmovdqa	32*5($in_t), $T2b
2965
2966	vpcmpeqd	$INDEX, $M0, $TMP0
2967	vpcmpeqd	$INDEX, $M1, $TMP1
2968	vpcmpeqd	$INDEX, $M2, $TMP2
2969
2970	vpaddd	$THREE, $M0, $M0
2971	vpaddd	$THREE, $M1, $M1
2972	vpaddd	$THREE, $M2, $M2
2973	lea	32*6($in_t), $in_t
2974
2975	vpand	$TMP0, $T0a, $T0a
2976	vpand	$TMP0, $T0b, $T0b
2977	vpand	$TMP1, $T1a, $T1a
2978	vpand	$TMP1, $T1b, $T1b
2979	vpand	$TMP2, $T2a, $T2a
2980	vpand	$TMP2, $T2b, $T2b
2981
2982	vpxor	$T0a, $Ra, $Ra
2983	vpxor	$T0b, $Rb, $Rb
2984	vpxor	$T1a, $Ra, $Ra
2985	vpxor	$T1b, $Rb, $Rb
2986	vpxor	$T2a, $Ra, $Ra
2987	vpxor	$T2b, $Rb, $Rb
2988
2989	dec %rax
2990	jnz .Lselect_loop_avx2_w7
2991
2992
2993	vmovdqa	32*0($in_t), $T0a
2994	vmovdqa	32*1($in_t), $T0b
2995
2996	vpcmpeqd	$INDEX, $M0, $TMP0
2997
2998	vpand	$TMP0, $T0a, $T0a
2999	vpand	$TMP0, $T0b, $T0b
3000
3001	vpxor	$T0a, $Ra, $Ra
3002	vpxor	$T0b, $Rb, $Rb
3003
3004	vmovdqu $Ra, 32*0($val)
3005	vmovdqu $Rb, 32*1($val)
3006	vzeroupper
3007___
3008$code.=<<___	if ($win64);
3009	movaps	(%rsp), %xmm6
3010	movaps	0x10(%rsp), %xmm7
3011	movaps	0x20(%rsp), %xmm8
3012	movaps	0x30(%rsp), %xmm9
3013	movaps	0x40(%rsp), %xmm10
3014	movaps	0x50(%rsp), %xmm11
3015	movaps	0x60(%rsp), %xmm12
3016	movaps	0x70(%rsp), %xmm13
3017	movaps	0x80(%rsp), %xmm14
3018	movaps	0x90(%rsp), %xmm15
3019	lea	(%r11), %rsp
3020___
3021$code.=<<___;
3022	ret
3023.cfi_endproc
3024.LSEH_end_ecp_nistz256_avx2_gather_w7:
3025.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3026___
3027} else {
3028$code.=<<___;
3029.globl	ecp_nistz256_avx2_gather_w7
3030.type	ecp_nistz256_avx2_gather_w7,\@function,3
3031.align	32
3032ecp_nistz256_avx2_gather_w7:
3033.cfi_startproc
3034	.byte	0x0f,0x0b	# ud2
3035	ret
3036.cfi_endproc
3037.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3038___
3039}
3040{{{
3041########################################################################
3042	# This block implements the higher-level point_double, point_add and
3043	# point_add_affine. The key to performance here is to let the
3044	# out-of-order execution logic overlap computations from the next step
3045	# with tail processing from the current step. By using a tailored
3046	# calling sequence we minimize inter-step overhead and give the
3047	# processor a better shot at overlapping operations...
3048	#
3049	# You will notice that input data is copied to the stack. The trouble
3050	# is that there are no registers to spare for holding the original
3051	# pointers, and reloading those pointers would create undesired
3052	# dependencies on the effective-address calculation paths. In other
3053	# words, it is all done to favour out-of-order execution logic.
3054#						<appro@openssl.org>
3055
3056my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3057my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3058my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3059my ($poly1,$poly3)=($acc6,$acc7);
3060
3061sub load_for_mul () {
3062my ($a,$b,$src0) = @_;
3063my $bias = $src0 eq "%rax" ? 0 : -128;
3064
3065"	mov	$b, $src0
3066	lea	$b, $b_ptr
3067	mov	8*0+$a, $acc1
3068	mov	8*1+$a, $acc2
3069	lea	$bias+$a, $a_ptr
3070	mov	8*2+$a, $acc3
3071	mov	8*3+$a, $acc4"
3072}
3073
3074sub load_for_sqr () {
3075my ($a,$src0) = @_;
3076my $bias = $src0 eq "%rax" ? 0 : -128;
3077
3078"	mov	8*0+$a, $src0
3079	mov	8*1+$a, $acc6
3080	lea	$bias+$a, $a_ptr
3081	mov	8*2+$a, $acc7
3082	mov	8*3+$a, $acc0"
3083}
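# The two helpers above only emit the register loads that the low-level
# __ecp_nistz256_mul_mont$x/__ecp_nistz256_sqr_mont$x routines expect on
# entry: the limbs of the first operand in the accumulator registers, the
# multiplier limb in $src0 (%rax on the "q" path, %rdx on the mulx-based
# "x" path), and $a_ptr pre-biased so that the fixed displacements used
# inside those routines land on the right operand.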
3084
3085									{
3086########################################################################
3087# operate in 4-5-0-1 "name space" that matches multiplication output
3088#
3089my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3090
3091$code.=<<___;
3092.type	__ecp_nistz256_add_toq,\@abi-omnipotent
3093.align	32
3094__ecp_nistz256_add_toq:
3095.cfi_startproc
3096	xor	$t4,$t4
3097	add	8*0($b_ptr), $a0
3098	adc	8*1($b_ptr), $a1
3099	 mov	$a0, $t0
3100	adc	8*2($b_ptr), $a2
3101	adc	8*3($b_ptr), $a3
3102	 mov	$a1, $t1
3103	adc	\$0, $t4
3104
3105	sub	\$-1, $a0
3106	 mov	$a2, $t2
3107	sbb	$poly1, $a1
3108	sbb	\$0, $a2
3109	 mov	$a3, $t3
3110	sbb	$poly3, $a3
3111	sbb	\$0, $t4
3112
3113	cmovc	$t0, $a0
3114	cmovc	$t1, $a1
3115	mov	$a0, 8*0($r_ptr)
3116	cmovc	$t2, $a2
3117	mov	$a1, 8*1($r_ptr)
3118	cmovc	$t3, $a3
3119	mov	$a2, 8*2($r_ptr)
3120	mov	$a3, 8*3($r_ptr)
3121
3122	ret
3123.cfi_endproc
3124.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
3125
3126.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
3127.align	32
3128__ecp_nistz256_sub_fromq:
3129.cfi_startproc
3130	sub	8*0($b_ptr), $a0
3131	sbb	8*1($b_ptr), $a1
3132	 mov	$a0, $t0
3133	sbb	8*2($b_ptr), $a2
3134	sbb	8*3($b_ptr), $a3
3135	 mov	$a1, $t1
3136	sbb	$t4, $t4
3137
3138	add	\$-1, $a0
3139	 mov	$a2, $t2
3140	adc	$poly1, $a1
3141	adc	\$0, $a2
3142	 mov	$a3, $t3
3143	adc	$poly3, $a3
3144	test	$t4, $t4
3145
3146	cmovz	$t0, $a0
3147	cmovz	$t1, $a1
3148	mov	$a0, 8*0($r_ptr)
3149	cmovz	$t2, $a2
3150	mov	$a1, 8*1($r_ptr)
3151	cmovz	$t3, $a3
3152	mov	$a2, 8*2($r_ptr)
3153	mov	$a3, 8*3($r_ptr)
3154
3155	ret
3156.cfi_endproc
3157.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
3158
3159.type	__ecp_nistz256_subq,\@abi-omnipotent
3160.align	32
3161__ecp_nistz256_subq:
3162.cfi_startproc
3163	sub	$a0, $t0
3164	sbb	$a1, $t1
3165	 mov	$t0, $a0
3166	sbb	$a2, $t2
3167	sbb	$a3, $t3
3168	 mov	$t1, $a1
3169	sbb	$t4, $t4
3170
3171	add	\$-1, $t0
3172	 mov	$t2, $a2
3173	adc	$poly1, $t1
3174	adc	\$0, $t2
3175	 mov	$t3, $a3
3176	adc	$poly3, $t3
3177	test	$t4, $t4
3178
3179	cmovnz	$t0, $a0
3180	cmovnz	$t1, $a1
3181	cmovnz	$t2, $a2
3182	cmovnz	$t3, $a3
3183
3184	ret
3185.cfi_endproc
3186.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
3187
3188.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
3189.align	32
3190__ecp_nistz256_mul_by_2q:
3191.cfi_startproc
3192	xor	$t4, $t4
3193	add	$a0, $a0		# a0:a3+a0:a3
3194	adc	$a1, $a1
3195	 mov	$a0, $t0
3196	adc	$a2, $a2
3197	adc	$a3, $a3
3198	 mov	$a1, $t1
3199	adc	\$0, $t4
3200
3201	sub	\$-1, $a0
3202	 mov	$a2, $t2
3203	sbb	$poly1, $a1
3204	sbb	\$0, $a2
3205	 mov	$a3, $t3
3206	sbb	$poly3, $a3
3207	sbb	\$0, $t4
3208
3209	cmovc	$t0, $a0
3210	cmovc	$t1, $a1
3211	mov	$a0, 8*0($r_ptr)
3212	cmovc	$t2, $a2
3213	mov	$a1, 8*1($r_ptr)
3214	cmovc	$t3, $a3
3215	mov	$a2, 8*2($r_ptr)
3216	mov	$a3, 8*3($r_ptr)
3217
3218	ret
3219.cfi_endproc
3220.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
3221___
3222									}
3223sub gen_double () {
3224    my $x = shift;
3225    my ($src0,$sfx,$bias);
3226    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
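    # For reference, the call sequence below implements the usual Jacobian
    # doubling (formulas only, in the notation of the inline comments; the
    # code computes everything in Montgomery form):
    #   S = 4*x*y^2,  M = 3*(x + z^2)*(x - z^2),
    #   res_x = M^2 - 2*S,  res_y = M*(S - res_x) - 8*y^4,  res_z = 2*y*z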
3227
3228    if ($x ne "x") {
3229	$src0 = "%rax";
3230	$sfx  = "";
3231	$bias = 0;
3232
3233$code.=<<___;
3234.globl	ecp_nistz256_point_double
3235.type	ecp_nistz256_point_double,\@function,2
3236.align	32
3237ecp_nistz256_point_double:
3238.cfi_startproc
3239___
3240$code.=<<___	if ($addx);
3241	mov	\$0x80100, %ecx
3242	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3243	cmp	\$0x80100, %ecx
3244	je	.Lpoint_doublex
3245___
3246    } else {
3247	$src0 = "%rdx";
3248	$sfx  = "x";
3249	$bias = 128;
3250
3251$code.=<<___;
3252.type	ecp_nistz256_point_doublex,\@function,2
3253.align	32
3254ecp_nistz256_point_doublex:
3255.cfi_startproc
3256.Lpoint_doublex:
3257___
3258    }
3259$code.=<<___;
3260	push	%rbp
3261.cfi_push	%rbp
3262	push	%rbx
3263.cfi_push	%rbx
3264	push	%r12
3265.cfi_push	%r12
3266	push	%r13
3267.cfi_push	%r13
3268	push	%r14
3269.cfi_push	%r14
3270	push	%r15
3271.cfi_push	%r15
3272	sub	\$32*5+8, %rsp
3273.cfi_adjust_cfa_offset	32*5+8
3274.Lpoint_double${x}_body:
3275
3276.Lpoint_double_shortcut$x:
3277	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
3278	mov	$a_ptr, $b_ptr			# backup copy
3279	movdqu	0x10($a_ptr), %xmm1
3280	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
3281	 mov	0x20+8*1($a_ptr), $acc5
3282	 mov	0x20+8*2($a_ptr), $acc0
3283	 mov	0x20+8*3($a_ptr), $acc1
3284	 mov	.Lpoly+8*1(%rip), $poly1
3285	 mov	.Lpoly+8*3(%rip), $poly3
3286	movdqa	%xmm0, $in_x(%rsp)
3287	movdqa	%xmm1, $in_x+0x10(%rsp)
3288	lea	0x20($r_ptr), $acc2
3289	lea	0x40($r_ptr), $acc3
3290	movq	$r_ptr, %xmm0
3291	movq	$acc2, %xmm1
3292	movq	$acc3, %xmm2
3293
3294	lea	$S(%rsp), $r_ptr
3295	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
3296
3297	mov	0x40+8*0($a_ptr), $src0
3298	mov	0x40+8*1($a_ptr), $acc6
3299	mov	0x40+8*2($a_ptr), $acc7
3300	mov	0x40+8*3($a_ptr), $acc0
3301	lea	0x40-$bias($a_ptr), $a_ptr
3302	lea	$Zsqr(%rsp), $r_ptr
3303	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
3304
3305	`&load_for_sqr("$S(%rsp)", "$src0")`
3306	lea	$S(%rsp), $r_ptr
3307	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
3308
3309	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
3310	mov	0x40+8*0($b_ptr), $acc1
3311	mov	0x40+8*1($b_ptr), $acc2
3312	mov	0x40+8*2($b_ptr), $acc3
3313	mov	0x40+8*3($b_ptr), $acc4
3314	lea	0x40-$bias($b_ptr), $a_ptr
3315	lea	0x20($b_ptr), $b_ptr
3316	movq	%xmm2, $r_ptr
3317	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
3318	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
3319
3320	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3321	mov	$in_x+8*1(%rsp), $acc5
3322	lea	$Zsqr(%rsp), $b_ptr
3323	mov	$in_x+8*2(%rsp), $acc0
3324	mov	$in_x+8*3(%rsp), $acc1
3325	lea	$M(%rsp), $r_ptr
3326	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
3327
3328	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
3329	mov	$in_x+8*1(%rsp), $acc5
3330	lea	$Zsqr(%rsp), $b_ptr
3331	mov	$in_x+8*2(%rsp), $acc0
3332	mov	$in_x+8*3(%rsp), $acc1
3333	lea	$Zsqr(%rsp), $r_ptr
3334	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
3335
3336	`&load_for_sqr("$S(%rsp)", "$src0")`
3337	movq	%xmm1, $r_ptr
3338	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
3339___
3340{
3341######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3342# operate in 4-5-6-7 "name space" that matches squaring output
3343#
3344my ($poly1,$poly3)=($a_ptr,$t1);
3345my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
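# Branchless halving mod p, roughly (illustration only):
#
#	t = a + p;			# computed unconditionally
#	a = (a & 1) ? t : a;		# cmovz keeps a when it is even
#	a >>= 1;			# 257-bit right shift, top bit from t4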
3346
3347$code.=<<___;
3348	xor	$t4, $t4
3349	mov	$a0, $t0
3350	add	\$-1, $a0
3351	mov	$a1, $t1
3352	adc	$poly1, $a1
3353	mov	$a2, $t2
3354	adc	\$0, $a2
3355	mov	$a3, $t3
3356	adc	$poly3, $a3
3357	adc	\$0, $t4
3358	xor	$a_ptr, $a_ptr		# borrow $a_ptr
3359	test	\$1, $t0
3360
3361	cmovz	$t0, $a0
3362	cmovz	$t1, $a1
3363	cmovz	$t2, $a2
3364	cmovz	$t3, $a3
3365	cmovz	$a_ptr, $t4
3366
3367	mov	$a1, $t0		# a0:a3>>1
3368	shr	\$1, $a0
3369	shl	\$63, $t0
3370	mov	$a2, $t1
3371	shr	\$1, $a1
3372	or	$t0, $a0
3373	shl	\$63, $t1
3374	mov	$a3, $t2
3375	shr	\$1, $a2
3376	or	$t1, $a1
3377	shl	\$63, $t2
3378	mov	$a0, 8*0($r_ptr)
3379	shr	\$1, $a3
3380	mov	$a1, 8*1($r_ptr)
3381	shl	\$63, $t4
3382	or	$t2, $a2
3383	or	$t4, $a3
3384	mov	$a2, 8*2($r_ptr)
3385	mov	$a3, 8*3($r_ptr)
3386___
3387}
3388$code.=<<___;
3389	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3390	lea	$M(%rsp), $r_ptr
3391	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
3392
3393	lea	$tmp0(%rsp), $r_ptr
3394	call	__ecp_nistz256_mul_by_2$x
3395
3396	lea	$M(%rsp), $b_ptr
3397	lea	$M(%rsp), $r_ptr
3398	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
3399
3400	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3401	lea	$S(%rsp), $r_ptr
3402	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
3403
3404	lea	$tmp0(%rsp), $r_ptr
3405	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
3406
3407	`&load_for_sqr("$M(%rsp)", "$src0")`
3408	movq	%xmm0, $r_ptr
3409	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
3410
3411	lea	$tmp0(%rsp), $b_ptr
3412	mov	$acc6, $acc0			# harmonize sqr output and sub input
3413	mov	$acc7, $acc1
3414	mov	$a_ptr, $poly1
3415	mov	$t1, $poly3
3416	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
3417
3418	mov	$S+8*0(%rsp), $t0
3419	mov	$S+8*1(%rsp), $t1
3420	mov	$S+8*2(%rsp), $t2
3421	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
3422	lea	$S(%rsp), $r_ptr
3423	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
3424
3425	mov	$M(%rsp), $src0
3426	lea	$M(%rsp), $b_ptr
3427	mov	$acc4, $acc6			# harmonize sub output and mul input
3428	xor	%ecx, %ecx
3429	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
3430	mov	$acc5, $acc2
3431	mov	$acc5, $S+8*1(%rsp)
3432	cmovz	$acc0, $acc3
3433	mov	$acc0, $S+8*2(%rsp)
3434	lea	$S-$bias(%rsp), $a_ptr
3435	cmovz	$acc1, $acc4
3436	mov	$acc1, $S+8*3(%rsp)
3437	mov	$acc6, $acc1
3438	lea	$S(%rsp), $r_ptr
3439	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
3440
3441	movq	%xmm1, $b_ptr
3442	movq	%xmm1, $r_ptr
3443	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
3444
3445	lea	32*5+56(%rsp), %rsi
3446.cfi_def_cfa	%rsi,8
3447	mov	-48(%rsi),%r15
3448.cfi_restore	%r15
3449	mov	-40(%rsi),%r14
3450.cfi_restore	%r14
3451	mov	-32(%rsi),%r13
3452.cfi_restore	%r13
3453	mov	-24(%rsi),%r12
3454.cfi_restore	%r12
3455	mov	-16(%rsi),%rbx
3456.cfi_restore	%rbx
3457	mov	-8(%rsi),%rbp
3458.cfi_restore	%rbp
3459	lea	(%rsi),%rsp
3460.cfi_def_cfa_register	%rsp
3461.Lpoint_double${x}_epilogue:
3462	ret
3463.cfi_endproc
3464.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
3465___
3466}
3467&gen_double("q");
3468
3469sub gen_add () {
3470    my $x = shift;
3471    my ($src0,$sfx,$bias);
3472    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3473	$U1,$U2,$S1,$S2,
3474	$res_x,$res_y,$res_z,
3475	$in1_x,$in1_y,$in1_z,
3476	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3477    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
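    # For reference, the call sequence below implements the standard
    # Jacobian addition (formulas only, in the notation of the inline
    # comments):
    #   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
    #   H = U2 - U1,  R = S2 - S1,
    #   res_x = R^2 - H^3 - 2*U1*H^2,
    #   res_y = R*(U1*H^2 - res_x) - S1*H^3,
    #   res_z = H*Z1*Z2
    # Inputs at infinity and the P == Q case are detected and handled
    # separately further down.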
3478
3479    if ($x ne "x") {
3480	$src0 = "%rax";
3481	$sfx  = "";
3482	$bias = 0;
3483
3484$code.=<<___;
3485.globl	ecp_nistz256_point_add
3486.type	ecp_nistz256_point_add,\@function,3
3487.align	32
3488ecp_nistz256_point_add:
3489.cfi_startproc
3490___
3491$code.=<<___	if ($addx);
3492	mov	\$0x80100, %ecx
3493	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3494	cmp	\$0x80100, %ecx
3495	je	.Lpoint_addx
3496___
3497    } else {
3498	$src0 = "%rdx";
3499	$sfx  = "x";
3500	$bias = 128;
3501
3502$code.=<<___;
3503.type	ecp_nistz256_point_addx,\@function,3
3504.align	32
3505ecp_nistz256_point_addx:
3506.cfi_startproc
3507.Lpoint_addx:
3508___
3509    }
3510$code.=<<___;
3511	push	%rbp
3512.cfi_push	%rbp
3513	push	%rbx
3514.cfi_push	%rbx
3515	push	%r12
3516.cfi_push	%r12
3517	push	%r13
3518.cfi_push	%r13
3519	push	%r14
3520.cfi_push	%r14
3521	push	%r15
3522.cfi_push	%r15
3523	sub	\$32*18+8, %rsp
3524.cfi_adjust_cfa_offset	32*18+8
3525.Lpoint_add${x}_body:
3526
3527	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3528	movdqu	0x10($a_ptr), %xmm1
3529	movdqu	0x20($a_ptr), %xmm2
3530	movdqu	0x30($a_ptr), %xmm3
3531	movdqu	0x40($a_ptr), %xmm4
3532	movdqu	0x50($a_ptr), %xmm5
3533	mov	$a_ptr, $b_ptr			# reassign
3534	mov	$b_org, $a_ptr			# reassign
3535	movdqa	%xmm0, $in1_x(%rsp)
3536	movdqa	%xmm1, $in1_x+0x10(%rsp)
3537	movdqa	%xmm2, $in1_y(%rsp)
3538	movdqa	%xmm3, $in1_y+0x10(%rsp)
3539	movdqa	%xmm4, $in1_z(%rsp)
3540	movdqa	%xmm5, $in1_z+0x10(%rsp)
3541	por	%xmm4, %xmm5
3542
3543	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3544	 pshufd	\$0xb1, %xmm5, %xmm3
3545	movdqu	0x10($a_ptr), %xmm1
3546	movdqu	0x20($a_ptr), %xmm2
3547	 por	%xmm3, %xmm5
3548	movdqu	0x30($a_ptr), %xmm3
3549	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3550	 mov	0x40+8*1($a_ptr), $acc6
3551	 mov	0x40+8*2($a_ptr), $acc7
3552	 mov	0x40+8*3($a_ptr), $acc0
3553	movdqa	%xmm0, $in2_x(%rsp)
3554	 pshufd	\$0x1e, %xmm5, %xmm4
3555	movdqa	%xmm1, $in2_x+0x10(%rsp)
3556	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3557	movdqu	0x50($a_ptr),%xmm1
3558	movdqa	%xmm2, $in2_y(%rsp)
3559	movdqa	%xmm3, $in2_y+0x10(%rsp)
3560	 por	%xmm4, %xmm5
3561	 pxor	%xmm4, %xmm4
3562	por	%xmm0, %xmm1
3563	 movq	$r_ptr, %xmm0			# save $r_ptr
3564
3565	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3566	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3567	 mov	$acc6, $in2_z+8*1(%rsp)
3568	 mov	$acc7, $in2_z+8*2(%rsp)
3569	 mov	$acc0, $in2_z+8*3(%rsp)
3570	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3571	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3572
3573	pcmpeqd	%xmm4, %xmm5
3574	pshufd	\$0xb1, %xmm1, %xmm4
3575	por	%xmm1, %xmm4
3576	pshufd	\$0, %xmm5, %xmm5		# in1infty
3577	pshufd	\$0x1e, %xmm4, %xmm3
3578	por	%xmm3, %xmm4
3579	pxor	%xmm3, %xmm3
3580	pcmpeqd	%xmm3, %xmm4
3581	pshufd	\$0, %xmm4, %xmm4		# in2infty
3582	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3583	 mov	0x40+8*1($b_ptr), $acc6
3584	 mov	0x40+8*2($b_ptr), $acc7
3585	 mov	0x40+8*3($b_ptr), $acc0
3586	movq	$b_ptr, %xmm1
3587
3588	lea	0x40-$bias($b_ptr), $a_ptr
3589	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3590	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3591
3592	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3593	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3594	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3595
3596	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3597	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3598	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3599
3600	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3601	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3602	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3603
3604	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3605	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3606	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3607
3608	lea	$S1(%rsp), $b_ptr
3609	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3610	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3611
3612	or	$acc5, $acc4			# see if result is zero
3613	movdqa	%xmm4, %xmm2
3614	or	$acc0, $acc4
3615	or	$acc1, $acc4
3616	por	%xmm5, %xmm2			# in1infty || in2infty
3617	movq	$acc4, %xmm3
3618
3619	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3620	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3621	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3622
3623	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3624	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3625	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3626
3627	lea	$U1(%rsp), $b_ptr
3628	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3629	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3630
3631	or	$acc5, $acc4			# see if result is zero
3632	or	$acc0, $acc4
3633	or	$acc1, $acc4			# !is_equal(U1, U2)
3634
3635	movq	%xmm2, $acc0			# in1infty | in2infty
3636	movq	%xmm3, $acc1			# !is_equal(S1, S2)
3637
3638	or	$acc0, $acc4
3639	or	$acc1, $acc4
3640
3641	# if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2))
3642	.byte	0x3e				# predict taken
3643	jnz	.Ladd_proceed$x
3644
3645.Ladd_double$x:
3646	movq	%xmm1, $a_ptr			# restore $a_ptr
3647	movq	%xmm0, $r_ptr			# restore $r_ptr
3648	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3649.cfi_adjust_cfa_offset	`-32*(18-5)`
3650	jmp	.Lpoint_double_shortcut$x
3651.cfi_adjust_cfa_offset	`32*(18-5)`
3652
3653.align	32
3654.Ladd_proceed$x:
3655	`&load_for_sqr("$R(%rsp)", "$src0")`
3656	lea	$Rsqr(%rsp), $r_ptr		# R^2
3657	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3658
3659	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3660	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3661	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3662
3663	`&load_for_sqr("$H(%rsp)", "$src0")`
3664	lea	$Hsqr(%rsp), $r_ptr		# H^2
3665	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3666
3667	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3668	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3669	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3670
3671	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3672	lea	$Hcub(%rsp), $r_ptr		# H^3
3673	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3674
3675	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3676	lea	$U2(%rsp), $r_ptr		# U1*H^2
3677	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3678___
3679{
3680#######################################################################
3681# operate in 4-5-0-1 "name space" that matches multiplication output
3682#
3683my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3684my ($poly1, $poly3)=($acc6,$acc7);
3685
3686$code.=<<___;
3687	#lea	$U2(%rsp), $a_ptr
3688	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3689	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3690
3691	xor	$t4, $t4
3692	add	$acc0, $acc0		# a0:a3+a0:a3
3693	lea	$Rsqr(%rsp), $a_ptr
3694	adc	$acc1, $acc1
3695	 mov	$acc0, $t0
3696	adc	$acc2, $acc2
3697	adc	$acc3, $acc3
3698	 mov	$acc1, $t1
3699	adc	\$0, $t4
3700
3701	sub	\$-1, $acc0
3702	 mov	$acc2, $t2
3703	sbb	$poly1, $acc1
3704	sbb	\$0, $acc2
3705	 mov	$acc3, $t3
3706	sbb	$poly3, $acc3
3707	sbb	\$0, $t4
3708
3709	cmovc	$t0, $acc0
3710	mov	8*0($a_ptr), $t0
3711	cmovc	$t1, $acc1
3712	mov	8*1($a_ptr), $t1
3713	cmovc	$t2, $acc2
3714	mov	8*2($a_ptr), $t2
3715	cmovc	$t3, $acc3
3716	mov	8*3($a_ptr), $t3
3717
3718	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3719
3720	lea	$Hcub(%rsp), $b_ptr
3721	lea	$res_x(%rsp), $r_ptr
3722	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3723
3724	mov	$U2+8*0(%rsp), $t0
3725	mov	$U2+8*1(%rsp), $t1
3726	mov	$U2+8*2(%rsp), $t2
3727	mov	$U2+8*3(%rsp), $t3
3728	lea	$res_y(%rsp), $r_ptr
3729
3730	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3731
3732	mov	$acc0, 8*0($r_ptr)		# save the result, as
3733	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3734	mov	$acc2, 8*2($r_ptr)
3735	mov	$acc3, 8*3($r_ptr)
3736___
3737}
3738$code.=<<___;
3739	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3740	lea	$S2(%rsp), $r_ptr
3741	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3742
3743	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3744	lea	$res_y(%rsp), $r_ptr
3745	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3746
3747	lea	$S2(%rsp), $b_ptr
3748	lea	$res_y(%rsp), $r_ptr
3749	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3750
3751	movq	%xmm0, $r_ptr		# restore $r_ptr
3752
3753	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3754	movdqa	%xmm5, %xmm1
3755	pandn	$res_z(%rsp), %xmm0
3756	movdqa	%xmm5, %xmm2
3757	pandn	$res_z+0x10(%rsp), %xmm1
3758	movdqa	%xmm5, %xmm3
3759	pand	$in2_z(%rsp), %xmm2
3760	pand	$in2_z+0x10(%rsp), %xmm3
3761	por	%xmm0, %xmm2
3762	por	%xmm1, %xmm3
3763
3764	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3765	movdqa	%xmm4, %xmm1
3766	pandn	%xmm2, %xmm0
3767	movdqa	%xmm4, %xmm2
3768	pandn	%xmm3, %xmm1
3769	movdqa	%xmm4, %xmm3
3770	pand	$in1_z(%rsp), %xmm2
3771	pand	$in1_z+0x10(%rsp), %xmm3
3772	por	%xmm0, %xmm2
3773	por	%xmm1, %xmm3
3774	movdqu	%xmm2, 0x40($r_ptr)
3775	movdqu	%xmm3, 0x50($r_ptr)
3776
3777	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3778	movdqa	%xmm5, %xmm1
3779	pandn	$res_x(%rsp), %xmm0
3780	movdqa	%xmm5, %xmm2
3781	pandn	$res_x+0x10(%rsp), %xmm1
3782	movdqa	%xmm5, %xmm3
3783	pand	$in2_x(%rsp), %xmm2
3784	pand	$in2_x+0x10(%rsp), %xmm3
3785	por	%xmm0, %xmm2
3786	por	%xmm1, %xmm3
3787
3788	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3789	movdqa	%xmm4, %xmm1
3790	pandn	%xmm2, %xmm0
3791	movdqa	%xmm4, %xmm2
3792	pandn	%xmm3, %xmm1
3793	movdqa	%xmm4, %xmm3
3794	pand	$in1_x(%rsp), %xmm2
3795	pand	$in1_x+0x10(%rsp), %xmm3
3796	por	%xmm0, %xmm2
3797	por	%xmm1, %xmm3
3798	movdqu	%xmm2, 0x00($r_ptr)
3799	movdqu	%xmm3, 0x10($r_ptr)
3800
3801	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3802	movdqa	%xmm5, %xmm1
3803	pandn	$res_y(%rsp), %xmm0
3804	movdqa	%xmm5, %xmm2
3805	pandn	$res_y+0x10(%rsp), %xmm1
3806	movdqa	%xmm5, %xmm3
3807	pand	$in2_y(%rsp), %xmm2
3808	pand	$in2_y+0x10(%rsp), %xmm3
3809	por	%xmm0, %xmm2
3810	por	%xmm1, %xmm3
3811
3812	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3813	movdqa	%xmm4, %xmm1
3814	pandn	%xmm2, %xmm0
3815	movdqa	%xmm4, %xmm2
3816	pandn	%xmm3, %xmm1
3817	movdqa	%xmm4, %xmm3
3818	pand	$in1_y(%rsp), %xmm2
3819	pand	$in1_y+0x10(%rsp), %xmm3
3820	por	%xmm0, %xmm2
3821	por	%xmm1, %xmm3
3822	movdqu	%xmm2, 0x20($r_ptr)
3823	movdqu	%xmm3, 0x30($r_ptr)
3824
3825.Ladd_done$x:
3826	lea	32*18+56(%rsp), %rsi
3827.cfi_def_cfa	%rsi,8
3828	mov	-48(%rsi),%r15
3829.cfi_restore	%r15
3830	mov	-40(%rsi),%r14
3831.cfi_restore	%r14
3832	mov	-32(%rsi),%r13
3833.cfi_restore	%r13
3834	mov	-24(%rsi),%r12
3835.cfi_restore	%r12
3836	mov	-16(%rsi),%rbx
3837.cfi_restore	%rbx
3838	mov	-8(%rsi),%rbp
3839.cfi_restore	%rbp
3840	lea	(%rsi),%rsp
3841.cfi_def_cfa_register	%rsp
3842.Lpoint_add${x}_epilogue:
3843	ret
3844.cfi_endproc
3845.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3846___
3847}
3848&gen_add("q");
3849
3850sub gen_add_affine () {
3851    my $x = shift;
3852    my ($src0,$sfx,$bias);
3853    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3854	$res_x,$res_y,$res_z,
3855	$in1_x,$in1_y,$in1_z,
3856	$in2_x,$in2_y)=map(32*$_,(0..14));
3857    my $Z1sqr = $S2;
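    # For reference: mixed addition with an affine second input (Z2 == 1),
    # so U1 = X1, S1 = Y1, U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1,
    # R = S2 - Y1, res_z = H*Z1, and res_x/res_y as in gen_add() above.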
3858
3859    if ($x ne "x") {
3860	$src0 = "%rax";
3861	$sfx  = "";
3862	$bias = 0;
3863
3864$code.=<<___;
3865.globl	ecp_nistz256_point_add_affine
3866.type	ecp_nistz256_point_add_affine,\@function,3
3867.align	32
3868ecp_nistz256_point_add_affine:
3869.cfi_startproc
3870___
3871$code.=<<___	if ($addx);
3872	mov	\$0x80100, %ecx
3873	and	OPENSSL_ia32cap_P+8(%rip), %ecx
3874	cmp	\$0x80100, %ecx
3875	je	.Lpoint_add_affinex
3876___
3877    } else {
3878	$src0 = "%rdx";
3879	$sfx  = "x";
3880	$bias = 128;
3881
3882$code.=<<___;
3883.type	ecp_nistz256_point_add_affinex,\@function,3
3884.align	32
3885ecp_nistz256_point_add_affinex:
3886.cfi_startproc
3887.Lpoint_add_affinex:
3888___
3889    }
3890$code.=<<___;
3891	push	%rbp
3892.cfi_push	%rbp
3893	push	%rbx
3894.cfi_push	%rbx
3895	push	%r12
3896.cfi_push	%r12
3897	push	%r13
3898.cfi_push	%r13
3899	push	%r14
3900.cfi_push	%r14
3901	push	%r15
3902.cfi_push	%r15
3903	sub	\$32*15+8, %rsp
3904.cfi_adjust_cfa_offset	32*15+8
3905.Ladd_affine${x}_body:
3906
3907	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3908	mov	$b_org, $b_ptr		# reassign
3909	movdqu	0x10($a_ptr), %xmm1
3910	movdqu	0x20($a_ptr), %xmm2
3911	movdqu	0x30($a_ptr), %xmm3
3912	movdqu	0x40($a_ptr), %xmm4
3913	movdqu	0x50($a_ptr), %xmm5
3914	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3915	 mov	0x40+8*1($a_ptr), $acc6
3916	 mov	0x40+8*2($a_ptr), $acc7
3917	 mov	0x40+8*3($a_ptr), $acc0
3918	movdqa	%xmm0, $in1_x(%rsp)
3919	movdqa	%xmm1, $in1_x+0x10(%rsp)
3920	movdqa	%xmm2, $in1_y(%rsp)
3921	movdqa	%xmm3, $in1_y+0x10(%rsp)
3922	movdqa	%xmm4, $in1_z(%rsp)
3923	movdqa	%xmm5, $in1_z+0x10(%rsp)
3924	por	%xmm4, %xmm5
3925
3926	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3927	 pshufd	\$0xb1, %xmm5, %xmm3
3928	movdqu	0x10($b_ptr), %xmm1
3929	movdqu	0x20($b_ptr), %xmm2
3930	 por	%xmm3, %xmm5
3931	movdqu	0x30($b_ptr), %xmm3
3932	movdqa	%xmm0, $in2_x(%rsp)
3933	 pshufd	\$0x1e, %xmm5, %xmm4
3934	movdqa	%xmm1, $in2_x+0x10(%rsp)
3935	por	%xmm0, %xmm1
3936	 movq	$r_ptr, %xmm0		# save $r_ptr
3937	movdqa	%xmm2, $in2_y(%rsp)
3938	movdqa	%xmm3, $in2_y+0x10(%rsp)
3939	por	%xmm2, %xmm3
3940	 por	%xmm4, %xmm5
3941	 pxor	%xmm4, %xmm4
3942	por	%xmm1, %xmm3
3943
3944	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3945	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3946	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3947
3948	pcmpeqd	%xmm4, %xmm5
3949	pshufd	\$0xb1, %xmm3, %xmm4
3950	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3951	 #lea	0x00($b_ptr), $b_ptr
3952	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3953	por	%xmm3, %xmm4
3954	pshufd	\$0, %xmm5, %xmm5		# in1infty
3955	pshufd	\$0x1e, %xmm4, %xmm3
3956	 mov	$acc5, $acc2
3957	por	%xmm3, %xmm4
3958	pxor	%xmm3, %xmm3
3959	 mov	$acc6, $acc3
3960	pcmpeqd	%xmm3, %xmm4
3961	pshufd	\$0, %xmm4, %xmm4		# in2infty
3962
3963	lea	$Z1sqr-$bias(%rsp), $a_ptr
3964	mov	$acc7, $acc4
3965	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3966	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3967
3968	lea	$in1_x(%rsp), $b_ptr
3969	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3970	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3971
3972	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3973	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3974	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3975
3976	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3977	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3978	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3979
3980	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3981	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3982	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3983
3984	lea	$in1_y(%rsp), $b_ptr
3985	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3986	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3987
3988	`&load_for_sqr("$H(%rsp)", "$src0")`
3989	lea	$Hsqr(%rsp), $r_ptr		# H^2
3990	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3991
3992	`&load_for_sqr("$R(%rsp)", "$src0")`
3993	lea	$Rsqr(%rsp), $r_ptr		# R^2
3994	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3995
3996	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3997	lea	$Hcub(%rsp), $r_ptr		# H^3
3998	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3999
4000	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
4001	lea	$U2(%rsp), $r_ptr		# U1*H^2
4002	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
4003___
4004{
4005#######################################################################
4006# operate in 4-5-0-1 "name space" that matches multiplication output
4007#
4008my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4009my ($poly1, $poly3)=($acc6,$acc7);
4010
4011$code.=<<___;
4012	#lea	$U2(%rsp), $a_ptr
4013	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
4014	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
4015
4016	xor	$t4, $t4
4017	add	$acc0, $acc0		# a0:a3+a0:a3
4018	lea	$Rsqr(%rsp), $a_ptr
4019	adc	$acc1, $acc1
4020	 mov	$acc0, $t0
4021	adc	$acc2, $acc2
4022	adc	$acc3, $acc3
4023	 mov	$acc1, $t1
4024	adc	\$0, $t4
4025
4026	sub	\$-1, $acc0
4027	 mov	$acc2, $t2
4028	sbb	$poly1, $acc1
4029	sbb	\$0, $acc2
4030	 mov	$acc3, $t3
4031	sbb	$poly3, $acc3
4032	sbb	\$0, $t4
4033
4034	cmovc	$t0, $acc0
4035	mov	8*0($a_ptr), $t0
4036	cmovc	$t1, $acc1
4037	mov	8*1($a_ptr), $t1
4038	cmovc	$t2, $acc2
4039	mov	8*2($a_ptr), $t2
4040	cmovc	$t3, $acc3
4041	mov	8*3($a_ptr), $t3
4042
4043	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
4044
4045	lea	$Hcub(%rsp), $b_ptr
4046	lea	$res_x(%rsp), $r_ptr
4047	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
4048
4049	mov	$U2+8*0(%rsp), $t0
4050	mov	$U2+8*1(%rsp), $t1
4051	mov	$U2+8*2(%rsp), $t2
4052	mov	$U2+8*3(%rsp), $t3
4053	lea	$H(%rsp), $r_ptr
4054
4055	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
4056
4057	mov	$acc0, 8*0($r_ptr)		# save the result, as
4058	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
4059	mov	$acc2, 8*2($r_ptr)
4060	mov	$acc3, 8*3($r_ptr)
4061___
4062}
4063$code.=<<___;
4064	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4065	lea	$S2(%rsp), $r_ptr
4066	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
4067
4068	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4069	lea	$H(%rsp), $r_ptr
4070	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
4071
4072	lea	$S2(%rsp), $b_ptr
4073	lea	$res_y(%rsp), $r_ptr
4074	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
4075
4076	movq	%xmm0, $r_ptr		# restore $r_ptr
4077
4078	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
4079	movdqa	%xmm5, %xmm1
4080	pandn	$res_z(%rsp), %xmm0
4081	movdqa	%xmm5, %xmm2
4082	pandn	$res_z+0x10(%rsp), %xmm1
4083	movdqa	%xmm5, %xmm3
4084	pand	.LONE_mont(%rip), %xmm2
4085	pand	.LONE_mont+0x10(%rip), %xmm3
4086	por	%xmm0, %xmm2
4087	por	%xmm1, %xmm3
4088
4089	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
4090	movdqa	%xmm4, %xmm1
4091	pandn	%xmm2, %xmm0
4092	movdqa	%xmm4, %xmm2
4093	pandn	%xmm3, %xmm1
4094	movdqa	%xmm4, %xmm3
4095	pand	$in1_z(%rsp), %xmm2
4096	pand	$in1_z+0x10(%rsp), %xmm3
4097	por	%xmm0, %xmm2
4098	por	%xmm1, %xmm3
4099	movdqu	%xmm2, 0x40($r_ptr)
4100	movdqu	%xmm3, 0x50($r_ptr)
4101
4102	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
4103	movdqa	%xmm5, %xmm1
4104	pandn	$res_x(%rsp), %xmm0
4105	movdqa	%xmm5, %xmm2
4106	pandn	$res_x+0x10(%rsp), %xmm1
4107	movdqa	%xmm5, %xmm3
4108	pand	$in2_x(%rsp), %xmm2
4109	pand	$in2_x+0x10(%rsp), %xmm3
4110	por	%xmm0, %xmm2
4111	por	%xmm1, %xmm3
4112
4113	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
4114	movdqa	%xmm4, %xmm1
4115	pandn	%xmm2, %xmm0
4116	movdqa	%xmm4, %xmm2
4117	pandn	%xmm3, %xmm1
4118	movdqa	%xmm4, %xmm3
4119	pand	$in1_x(%rsp), %xmm2
4120	pand	$in1_x+0x10(%rsp), %xmm3
4121	por	%xmm0, %xmm2
4122	por	%xmm1, %xmm3
4123	movdqu	%xmm2, 0x00($r_ptr)
4124	movdqu	%xmm3, 0x10($r_ptr)
4125
4126	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
4127	movdqa	%xmm5, %xmm1
4128	pandn	$res_y(%rsp), %xmm0
4129	movdqa	%xmm5, %xmm2
4130	pandn	$res_y+0x10(%rsp), %xmm1
4131	movdqa	%xmm5, %xmm3
4132	pand	$in2_y(%rsp), %xmm2
4133	pand	$in2_y+0x10(%rsp), %xmm3
4134	por	%xmm0, %xmm2
4135	por	%xmm1, %xmm3
4136
4137	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
4138	movdqa	%xmm4, %xmm1
4139	pandn	%xmm2, %xmm0
4140	movdqa	%xmm4, %xmm2
4141	pandn	%xmm3, %xmm1
4142	movdqa	%xmm4, %xmm3
4143	pand	$in1_y(%rsp), %xmm2
4144	pand	$in1_y+0x10(%rsp), %xmm3
4145	por	%xmm0, %xmm2
4146	por	%xmm1, %xmm3
4147	movdqu	%xmm2, 0x20($r_ptr)
4148	movdqu	%xmm3, 0x30($r_ptr)
4149
4150	lea	32*15+56(%rsp), %rsi
4151.cfi_def_cfa	%rsi,8
4152	mov	-48(%rsi),%r15
4153.cfi_restore	%r15
4154	mov	-40(%rsi),%r14
4155.cfi_restore	%r14
4156	mov	-32(%rsi),%r13
4157.cfi_restore	%r13
4158	mov	-24(%rsi),%r12
4159.cfi_restore	%r12
4160	mov	-16(%rsi),%rbx
4161.cfi_restore	%rbx
4162	mov	-8(%rsi),%rbp
4163.cfi_restore	%rbp
4164	lea	(%rsi),%rsp
4165.cfi_def_cfa_register	%rsp
4166.Ladd_affine${x}_epilogue:
4167	ret
4168.cfi_endproc
4169.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
4170___
4171}
4172&gen_add_affine("q");
4173
4174########################################################################
4175# AD*X magic
4176#
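# The "x"-suffixed subroutines below are drop-in replacements for the "q"
# ones above, emitted only when the assembler supports ADCX/ADOX ($addx);
# the point routines generated with them route their field multiplications
# and squarings through the mulx/adcx/adox-based __ecp_nistz256_*_montx
# code paths.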
4177if ($addx) {								{
4178########################################################################
4179# operate in 4-5-0-1 "name space" that matches multiplication output
4180#
4181my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
4182
4183$code.=<<___;
4184.type	__ecp_nistz256_add_tox,\@abi-omnipotent
4185.align	32
4186__ecp_nistz256_add_tox:
4187.cfi_startproc
4188	xor	$t4, $t4
4189	adc	8*0($b_ptr), $a0
4190	adc	8*1($b_ptr), $a1
4191	 mov	$a0, $t0
4192	adc	8*2($b_ptr), $a2
4193	adc	8*3($b_ptr), $a3
4194	 mov	$a1, $t1
4195	adc	\$0, $t4
4196
4197	xor	$t3, $t3
4198	sbb	\$-1, $a0
4199	 mov	$a2, $t2
4200	sbb	$poly1, $a1
4201	sbb	\$0, $a2
4202	 mov	$a3, $t3
4203	sbb	$poly3, $a3
4204	sbb	\$0, $t4
4205
4206	cmovc	$t0, $a0
4207	cmovc	$t1, $a1
4208	mov	$a0, 8*0($r_ptr)
4209	cmovc	$t2, $a2
4210	mov	$a1, 8*1($r_ptr)
4211	cmovc	$t3, $a3
4212	mov	$a2, 8*2($r_ptr)
4213	mov	$a3, 8*3($r_ptr)
4214
4215	ret
4216.cfi_endproc
4217.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
4218
4219.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
4220.align	32
4221__ecp_nistz256_sub_fromx:
4222.cfi_startproc
4223	xor	$t4, $t4
4224	sbb	8*0($b_ptr), $a0
4225	sbb	8*1($b_ptr), $a1
4226	 mov	$a0, $t0
4227	sbb	8*2($b_ptr), $a2
4228	sbb	8*3($b_ptr), $a3
4229	 mov	$a1, $t1
4230	sbb	\$0, $t4
4231
4232	xor	$t3, $t3
4233	adc	\$-1, $a0
4234	 mov	$a2, $t2
4235	adc	$poly1, $a1
4236	adc	\$0, $a2
4237	 mov	$a3, $t3
4238	adc	$poly3, $a3
4239
4240	bt	\$0, $t4
4241	cmovnc	$t0, $a0
4242	cmovnc	$t1, $a1
4243	mov	$a0, 8*0($r_ptr)
4244	cmovnc	$t2, $a2
4245	mov	$a1, 8*1($r_ptr)
4246	cmovnc	$t3, $a3
4247	mov	$a2, 8*2($r_ptr)
4248	mov	$a3, 8*3($r_ptr)
4249
4250	ret
4251.cfi_endproc
4252.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
4253
4254.type	__ecp_nistz256_subx,\@abi-omnipotent
4255.align	32
4256__ecp_nistz256_subx:
4257.cfi_startproc
4258	xor	$t4, $t4
4259	sbb	$a0, $t0
4260	sbb	$a1, $t1
4261	 mov	$t0, $a0
4262	sbb	$a2, $t2
4263	sbb	$a3, $t3
4264	 mov	$t1, $a1
4265	sbb	\$0, $t4
4266
4267	xor	$a3, $a3
4268	adc	\$-1, $t0
4269	 mov	$t2, $a2
4270	adc	$poly1, $t1
4271	adc	\$0, $t2
4272	 mov	$t3, $a3
4273	adc	$poly3, $t3
4274
4275	bt	\$0, $t4
4276	cmovc	$t0, $a0
4277	cmovc	$t1, $a1
4278	cmovc	$t2, $a2
4279	cmovc	$t3, $a3
4280
4281	ret
4282.cfi_endproc
4283.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
4284
4285.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
4286.align	32
4287__ecp_nistz256_mul_by_2x:
4288.cfi_startproc
4289	xor	$t4, $t4
4290	adc	$a0, $a0		# a0:a3+a0:a3
4291	adc	$a1, $a1
4292	 mov	$a0, $t0
4293	adc	$a2, $a2
4294	adc	$a3, $a3
4295	 mov	$a1, $t1
4296	adc	\$0, $t4
4297
4298	xor	$t3, $t3
4299	sbb	\$-1, $a0
4300	 mov	$a2, $t2
4301	sbb	$poly1, $a1
4302	sbb	\$0, $a2
4303	 mov	$a3, $t3
4304	sbb	$poly3, $a3
4305	sbb	\$0, $t4
4306
4307	cmovc	$t0, $a0
4308	cmovc	$t1, $a1
4309	mov	$a0, 8*0($r_ptr)
4310	cmovc	$t2, $a2
4311	mov	$a1, 8*1($r_ptr)
4312	cmovc	$t3, $a3
4313	mov	$a2, 8*2($r_ptr)
4314	mov	$a3, 8*3($r_ptr)
4315
4316	ret
4317.cfi_endproc
4318.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
4319___
4320									}
4321&gen_double("x");
4322&gen_add("x");
4323&gen_add_affine("x");
4324}
4325}}}
4326
4327# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4328#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4329if ($win64) {
4330$rec="%rcx";
4331$frame="%rdx";
4332$context="%r8";
4333$disp="%r9";
4334
4335$code.=<<___;
4336.extern	__imp_RtlVirtualUnwind
4337
4338.type	short_handler,\@abi-omnipotent
4339.align	16
4340short_handler:
4341	push	%rsi
4342	push	%rdi
4343	push	%rbx
4344	push	%rbp
4345	push	%r12
4346	push	%r13
4347	push	%r14
4348	push	%r15
4349	pushfq
4350	sub	\$64,%rsp
4351
4352	mov	120($context),%rax	# pull context->Rax
4353	mov	248($context),%rbx	# pull context->Rip
4354
4355	mov	8($disp),%rsi		# disp->ImageBase
4356	mov	56($disp),%r11		# disp->HandlerData
4357
4358	mov	0(%r11),%r10d		# HandlerData[0]
4359	lea	(%rsi,%r10),%r10	# end of prologue label
4360	cmp	%r10,%rbx		# context->Rip<end of prologue label
4361	jb	.Lcommon_seh_tail
4362
4363	mov	152($context),%rax	# pull context->Rsp
4364
4365	mov	4(%r11),%r10d		# HandlerData[1]
4366	lea	(%rsi,%r10),%r10	# epilogue label
4367	cmp	%r10,%rbx		# context->Rip>=epilogue label
4368	jae	.Lcommon_seh_tail
4369
4370	lea	16(%rax),%rax
4371
4372	mov	-8(%rax),%r12
4373	mov	-16(%rax),%r13
4374	mov	%r12,216($context)	# restore context->R12
4375	mov	%r13,224($context)	# restore context->R13
4376
4377	jmp	.Lcommon_seh_tail
4378.size	short_handler,.-short_handler
4379
4380.type	full_handler,\@abi-omnipotent
4381.align	16
4382full_handler:
4383	push	%rsi
4384	push	%rdi
4385	push	%rbx
4386	push	%rbp
4387	push	%r12
4388	push	%r13
4389	push	%r14
4390	push	%r15
4391	pushfq
4392	sub	\$64,%rsp
4393
4394	mov	120($context),%rax	# pull context->Rax
4395	mov	248($context),%rbx	# pull context->Rip
4396
4397	mov	8($disp),%rsi		# disp->ImageBase
4398	mov	56($disp),%r11		# disp->HandlerData
4399
4400	mov	0(%r11),%r10d		# HandlerData[0]
4401	lea	(%rsi,%r10),%r10	# end of prologue label
4402	cmp	%r10,%rbx		# context->Rip<end of prologue label
4403	jb	.Lcommon_seh_tail
4404
4405	mov	152($context),%rax	# pull context->Rsp
4406
4407	mov	4(%r11),%r10d		# HandlerData[1]
4408	lea	(%rsi,%r10),%r10	# epilogue label
4409	cmp	%r10,%rbx		# context->Rip>=epilogue label
4410	jae	.Lcommon_seh_tail
4411
4412	mov	8(%r11),%r10d		# HandlerData[2]
4413	lea	(%rax,%r10),%rax
4414
4415	mov	-8(%rax),%rbp
4416	mov	-16(%rax),%rbx
4417	mov	-24(%rax),%r12
4418	mov	-32(%rax),%r13
4419	mov	-40(%rax),%r14
4420	mov	-48(%rax),%r15
4421	mov	%rbx,144($context)	# restore context->Rbx
4422	mov	%rbp,160($context)	# restore context->Rbp
4423	mov	%r12,216($context)	# restore context->R12
4424	mov	%r13,224($context)	# restore context->R13
4425	mov	%r14,232($context)	# restore context->R14
4426	mov	%r15,240($context)	# restore context->R15
4427
4428.Lcommon_seh_tail:
4429	mov	8(%rax),%rdi
4430	mov	16(%rax),%rsi
4431	mov	%rax,152($context)	# restore context->Rsp
4432	mov	%rsi,168($context)	# restore context->Rsi
4433	mov	%rdi,176($context)	# restore context->Rdi
4434
4435	mov	40($disp),%rdi		# disp->ContextRecord
4436	mov	$context,%rsi		# context
4437	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
4438	.long	0xa548f3fc		# cld; rep movsq
4439
4440	mov	$disp,%rsi
4441	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4442	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4443	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4444	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4445	mov	40(%rsi),%r10		# disp->ContextRecord
4446	lea	56(%rsi),%r11		# &disp->HandlerData
4447	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4448	mov	%r10,32(%rsp)		# arg5
4449	mov	%r11,40(%rsp)		# arg6
4450	mov	%r12,48(%rsp)		# arg7
4451	mov	%rcx,56(%rsp)		# arg8, (NULL)
4452	call	*__imp_RtlVirtualUnwind(%rip)
4453
4454	mov	\$1,%eax		# ExceptionContinueSearch
4455	add	\$64,%rsp
4456	popfq
4457	pop	%r15
4458	pop	%r14
4459	pop	%r13
4460	pop	%r12
4461	pop	%rbp
4462	pop	%rbx
4463	pop	%rdi
4464	pop	%rsi
4465	ret
4466.size	full_handler,.-full_handler
4467
4468.section	.pdata
4469.align	4
	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
	.rva	.LSEH_end_ecp_nistz256_mul_by_2
	.rva	.LSEH_info_ecp_nistz256_mul_by_2

	.rva	.LSEH_begin_ecp_nistz256_div_by_2
	.rva	.LSEH_end_ecp_nistz256_div_by_2
	.rva	.LSEH_info_ecp_nistz256_div_by_2

	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
	.rva	.LSEH_end_ecp_nistz256_mul_by_3
	.rva	.LSEH_info_ecp_nistz256_mul_by_3

	.rva	.LSEH_begin_ecp_nistz256_add
	.rva	.LSEH_end_ecp_nistz256_add
	.rva	.LSEH_info_ecp_nistz256_add

	.rva	.LSEH_begin_ecp_nistz256_sub
	.rva	.LSEH_end_ecp_nistz256_sub
	.rva	.LSEH_info_ecp_nistz256_sub

	.rva	.LSEH_begin_ecp_nistz256_neg
	.rva	.LSEH_end_ecp_nistz256_neg
	.rva	.LSEH_info_ecp_nistz256_neg

	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
___
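# Entries for the ADX/BMI2 ("x") and AVX2 gather routines are emitted only
# when assembler support was detected ($addx, $avx>1), mirroring the
# conditional generation of those routines themselves.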
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx

	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_to_mont
	.rva	.LSEH_end_ecp_nistz256_to_mont
	.rva	.LSEH_info_ecp_nistz256_to_mont

	.rva	.LSEH_begin_ecp_nistz256_mul_mont
	.rva	.LSEH_end_ecp_nistz256_mul_mont
	.rva	.LSEH_info_ecp_nistz256_mul_mont

	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
	.rva	.LSEH_end_ecp_nistz256_sqr_mont
	.rva	.LSEH_info_ecp_nistz256_sqr_mont

	.rva	.LSEH_begin_ecp_nistz256_from_mont
	.rva	.LSEH_end_ecp_nistz256_from_mont
	.rva	.LSEH_info_ecp_nistz256_from_mont

	.rva	.LSEH_begin_ecp_nistz256_gather_w5
	.rva	.LSEH_end_ecp_nistz256_gather_w5
	.rva	.LSEH_info_ecp_nistz256_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_gather_w7
	.rva	.LSEH_end_ecp_nistz256_gather_w7
	.rva	.LSEH_info_ecp_nistz256_gather_wX
___
$code.=<<___	if ($avx>1);
	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX

	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
___
$code.=<<___;
	.rva	.LSEH_begin_ecp_nistz256_point_double
	.rva	.LSEH_end_ecp_nistz256_point_double
	.rva	.LSEH_info_ecp_nistz256_point_double

	.rva	.LSEH_begin_ecp_nistz256_point_add
	.rva	.LSEH_end_ecp_nistz256_point_add
	.rva	.LSEH_info_ecp_nistz256_point_add

	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
	.rva	.LSEH_end_ecp_nistz256_point_add_affine
	.rva	.LSEH_info_ecp_nistz256_point_add_affine
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_ecp_nistz256_point_doublex
	.rva	.LSEH_end_ecp_nistz256_point_doublex
	.rva	.LSEH_info_ecp_nistz256_point_doublex

	.rva	.LSEH_begin_ecp_nistz256_point_addx
	.rva	.LSEH_end_ecp_nistz256_point_addx
	.rva	.LSEH_info_ecp_nistz256_point_addx

	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
___
$code.=<<___;

.section	.xdata
.align	8
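# Handler-based UNWIND_INFO entries start with .byte 9,0,0,0: version 1,
# UNW_FLAG_EHANDLER, zero prologue size and no unwind codes, so recovery is
# delegated entirely to the handler named by the following .rva.  The
# HandlerData that follows gives the body/epilogue labels and, for
# full_handler, the frame size.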
.LSEH_info_ecp_nistz256_mul_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_div_by_2:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_mul_by_3:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
.LSEH_info_ecp_nistz256_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_neg:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
.LSEH_info_ecp_nistz256_ord_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___ if ($addx);
.LSEH_info_ecp_nistz256_ord_mul_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_ord_sqr_montx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
	.long	48,0
___
$code.=<<___;
.LSEH_info_ecp_nistz256_to_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_mul_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_sqr_mont:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
	.long	48,0
.LSEH_info_ecp_nistz256_from_mont:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
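# The gather routines use raw unwind codes instead of a handler: the leading
# four bytes give version/flags, prologue size, code-slot count and frame
# register, and each following UNWIND_CODE mirrors one prologue instruction,
# listed in reverse order of the prologue.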
.LSEH_info_ecp_nistz256_gather_wX:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
	.align	8
___
$code.=<<___	if ($avx>1);
.LSEH_info_ecp_nistz256_avx2_gather_wX:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
	.align	8
___
$code.=<<___;
.LSEH_info_ecp_nistz256_point_double:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_add:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affine:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
	.long	32*15+56,0
___
$code.=<<___ if ($addx);
.align	8
.LSEH_info_ecp_nistz256_point_doublex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
	.long	32*5+56,0
.LSEH_info_ecp_nistz256_point_addx:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
	.long	32*18+56,0
.LSEH_info_ecp_nistz256_point_add_affinex:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
	.long	32*15+56,0
___
}

########################################################################
# Convert ecp_nistz256_table.c to the layout expected by ecp_nistz256_gather_w7
#
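# The table holds 37 sub-tables of 64 points, each point a pair of 256-bit
# coordinates, i.e. 64*16*37 32-bit words in total.  Each TOBN(hi,lo) pair
# below is flattened into two little-endian .long words (lo first).
#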
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

die "insane number of elements" if ($#arr != 64*16*37-1);

print <<___;
.section .rodata align=4096
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,\@object
.align	4096
ecp_nistz256_precomputed:
___
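# Emit 16 32-bit words (one 64-byte table entry) per .long line.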
while (@line=splice(@arr,0,16)) {
	print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
}
print <<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
