xref: /openssl/crypto/bn/asm/bn-586.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 1995-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
# Driver: locate the perlasm helpers, redirect STDOUT to the requested output
# file, then emit every bignum primitive defined further down in this file.

# Everything up to and including the last path separator of $0 is the
# directory holding this script; it is empty when $0 has no directory part.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  The generated code
# is written to STDOUT, so STDOUT is re-opened onto that file.  Use the
# three-argument form (the name is never interpreted as a mode or a dup
# specification) and fail loudly: silently falling back to the inherited
# STDOUT would leave the build with a missing or empty assembly file.
$output = pop;
if (defined($output) && length($output)) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# First remaining argument is handed to asm_init() unchanged.
&asm_init($ARGV[0]);

# The SSE2 code paths are generated only when the build passes
# -DOPENSSL_IA32_SSE2 on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
34
# bn_mul_add_words($name)
#
# Emits a routine called $name that computes r[i] += a[i]*w for i in
# 0..num-1 with carry propagation and returns the final carry word in eax.
# Stack arguments of the emitted routine, in order: r, a, num, w.
#
# When the script is run with -DOPENSSL_IA32_SSE2 the routine first tests
# bit 26 of the first dword of OPENSSL_ia32cap_P at run time and, if set,
# uses a pmuludq/paddq implementation on the MMX registers; otherwise it
# falls through to the plain integer mul/adc implementation.
#
# Both implementations return 0 without touching memory when num == 0.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path (no callee-saved registers
	# are used there, so nothing needs to be pushed).
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		# The single-word loop below is a do-while: without this guard
		# num == 0 would process one word and then wrap the counter.
		# mm1 is already zero, so the exit path returns carry 0.
		&test($c,$c);
		&jz(&label("maw_sse2_exit"));
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration.  The instruction order is hand
	# interleaved (loads/multiplies of later words overlap the carry
	# chain of earlier ones); do not reorder.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# Remaining 1..7 words, one per iteration.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags of sub
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# Integer path.  Same register saves as function_begin would emit;
	# function_end below restores them.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";	# $Low/$High name the mul result pair; the code below
	$High="edx";	# spells the registers out literally instead.
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	# mov and push leave EFLAGS alone, so this still tests the "and".
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	# Eight words per iteration; $i is the byte offset of the word.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));	# lea (unlike add) preserves the flags of sub
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);		# 0..7 words left over
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven tail words; every round but the last counts ecx down
	# and leaves as soon as it reaches zero.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);	# flags from dec
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the final carry

	&pop("ecx");	# drop the temporary pushed above

	&function_end($name);
	}
220
# bn_mul_words($name)
#
# Emits a routine called $name that computes r[i] = low word of
# (a[i]*w + carry) for i in 0..num-1 and returns the final carry word in
# eax.  Stack arguments of the emitted routine, in order: r, a, num, w.
#
# When the script is run with -DOPENSSL_IA32_SSE2 the routine tests bit 26
# of the first dword of OPENSSL_ia32cap_P at run time and, if set, uses a
# pmuludq/paddq loop; otherwise the integer mul/adc implementation runs.
#
# Both implementations return 0 without touching memory when num == 0.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		# The loop below is a do-while: without this guard num == 0
		# would process one word and then wrap the counter.  mm1 is
		# zero here, so the exit path returns 0.
		&test($c,$c);
		&jz(&label("mw_sse2_exit"));

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags of sub
		&jnz(&label("mw_sse2_loop"));

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# Integer path.  Same register saves as function_begin would emit;
	# function_end below restores them.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";	# $Low/$High name the mul result pair; the code below
	$High="edx";	# spells the registers out literally instead.
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	# Eight words per iteration; $i is the byte offset of the word.
	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# 0..7 words left over
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven tail words; every round but the last counts $num down
	# and leaves as soon as it reaches zero.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the final carry

	&function_end($name);
	}
331
# bn_sqr_words($name)
#
# Emits a routine called $name that stores the 64-bit square of each a[i]
# (i in 0..num-1) into r[2*i] (low word) and r[2*i+1] (high word).
# Stack arguments of the emitted routine, in order: r, a, num.
#
# When the script is run with -DOPENSSL_IA32_SSE2 the routine tests bit 26
# of the first dword of OPENSSL_ia32cap_P at run time and, if set, uses a
# pmuludq loop; otherwise the integer mul implementation runs.
#
# Both implementations leave memory untouched when num == 0.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		# The loop below is a do-while: without this guard num == 0
		# would square one word and then wrap the counter.
		&test($c,$c);
		&jz(&label("sqr_sse2_exit"));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2 (lea keeps the flags)
		&jnz(&label("sqr_sse2_loop"));

	&set_label("sqr_sse2_exit");
		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# Integer path.  Same register saves as function_begin would emit;
	# function_end below restores them.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	# Eight words per iteration; $i is the byte offset into a, and the
	# result pair lives at twice that offset in r.
	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low word
		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# 0..7 words left over
	&jz(&label("sw_end"));

	# Up to seven tail words; every round but the last counts $num down
	# and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low word
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");	# high word; mov keeps flags
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
420
# bn_div_words($name)
#
# Emits a leaf routine called $name that loads its three stack arguments
# into edx, eax and ecx (in that order), executes "div ecx" and returns,
# leaving the quotient produced by div in eax.  No registers are saved.
sub bn_div_words
	{
	my ($name)=@_;

	# Destination register for stack argument 0, 1 and 2 respectively:
	# div divides edx:eax by its operand.
	my @arg_reg=("edx","eax","ecx");

	&function_begin_B($name,"");
	foreach my $n (0 .. $#arg_reg) {
		&mov($arg_reg[$n],&wparam($n));
	}
	&div($arg_reg[2]);
	&ret();
	&function_end_B($name);
	}
433
# bn_add_words($name)
#
# Emits a routine called $name that computes r[i] = a[i] + b[i] + carry for
# i in 0..num-1 and returns the final carry (0 or 1) in eax.
# Stack arguments of the emitted routine, in order: r, a, b, num.
#
# The words are handled eight at a time in an unrolled loop, followed by an
# unrolled tail for the remaining num % 8 words.
sub bn_add_words
	{
	my ($name)=@_;

	# Register allocation.
	my ($ap,$bp,$rp)=("esi","edi","ebx");
	my $carry="eax";		# doubles as the return value
	my ($sum,$addend)=("ecx","edx");
	my $left="ebp";			# words still to process

	# Emit the code for the word at byte offset $disp.  The carry is kept
	# as a 0/1 value in $carry rather than in CF: it is added in first,
	# the resulting CF is captured with "adc carry,carry" (carry was just
	# zeroed by a flag-preserving mov), and the CF of the second add is
	# folded in with "adc carry,0".  When $countdown is true the word also
	# decrements $left and leaves for aw_end once it hits zero; the store
	# between dec and jz is a mov, which does not disturb the flags.
	my $emit_word=sub {
		my ($disp,$countdown)=@_;

		&mov($sum,&DWP($disp,$ap,"",0));	# *a
		&mov($addend,&DWP($disp,$bp,"",0));	# *b
		&add($sum,$carry);
		&mov($carry,0);
		&adc($carry,$carry);
		&add($sum,$addend);
		&adc($carry,0);
		&dec($left) if ($countdown);
		&mov(&DWP($disp,$rp,"",0),$sum);	# *r
		&jz(&label("aw_end")) if ($countdown);
	};

	&function_begin($name,"");

	&comment("");
	&mov($rp,&wparam(0));		# get r
	&mov($ap,&wparam(1));		# get a
	&mov($bp,&wparam(2));		# get b
	&mov($left,&wparam(3));		# get num
	&xor($carry,$carry);		# clear carry
	&and($left,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0 .. 7) {
		&comment("Round $word");
		$emit_word->(4*$word,0);
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("aw_loop"));

	# Tail: num % 8 words; the seventh word needs no countdown because
	# nothing can follow it.
	&set_label("aw_finish",0);
	&mov($left,&wparam(3));		# get num
	&and($left,7);
	&jz(&label("aw_end"));

	foreach my $word (0 .. 6) {
		&comment("Tail Round $word");
		$emit_word->(4*$word,$word<6);
	}
	&set_label("aw_end",0);

	# The carry already sits in eax, the return register.
	&function_end($name);
	}
505
# bn_sub_words($name)
#
# Emits a routine called $name that computes r[i] = a[i] - b[i] - borrow for
# i in 0..num-1 and returns the final borrow (0 or 1) in eax.
# Stack arguments of the emitted routine, in order: r, a, b, num.
#
# The words are handled eight at a time in an unrolled loop, followed by an
# unrolled tail for the remaining num % 8 words.  The local labels carry the
# same "aw_" names as in bn_add_words.
sub bn_sub_words
	{
	my ($name)=@_;

	# Register allocation.
	my ($ap,$bp,$rp)=("esi","edi","ebx");
	my $borrow="eax";		# doubles as the return value
	my ($diff,$subtrahend)=("ecx","edx");
	my $left="ebp";			# words still to process

	# Emit the code for the word at byte offset $disp.  The borrow is kept
	# as a 0/1 value in $borrow rather than in CF: it is subtracted first,
	# the resulting CF is captured with "adc borrow,borrow" (borrow was
	# just zeroed by a flag-preserving mov), and the CF of the second sub
	# is folded in with "adc borrow,0".  When $countdown is true the word
	# also decrements $left and leaves for aw_end once it hits zero; the
	# store between dec and jz is a mov, which does not disturb the flags.
	my $emit_word=sub {
		my ($disp,$countdown)=@_;

		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		&mov($subtrahend,&DWP($disp,$bp,"",0));	# *b
		&sub($diff,$borrow);
		&mov($borrow,0);
		&adc($borrow,$borrow);
		&sub($diff,$subtrahend);
		&adc($borrow,0);
		&dec($left) if ($countdown);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
		&jz(&label("aw_end")) if ($countdown);
	};

	&function_begin($name,"");

	&comment("");
	&mov($rp,&wparam(0));		# get r
	&mov($ap,&wparam(1));		# get a
	&mov($bp,&wparam(2));		# get b
	&mov($left,&wparam(3));		# get num
	&xor($borrow,$borrow);		# clear borrow
	&and($left,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0 .. 7) {
		&comment("Round $word");
		$emit_word->(4*$word,0);
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("aw_loop"));

	# Tail: num % 8 words; the seventh word needs no countdown because
	# nothing can follow it.
	&set_label("aw_finish",0);
	&mov($left,&wparam(3));		# get num
	&and($left,7);
	&jz(&label("aw_end"));

	foreach my $word (0 .. 6) {
		&comment("Tail Round $word");
		$emit_word->(4*$word,$word<6);
	}
	&set_label("aw_end",0);

	# The borrow already sits in eax, the return register.
	&function_end($name);
	}
577
# bn_sub_part_words($name)
#
# Emits a routine called $name taking five stack arguments, in order:
# r, a, b, num ("cl" words common to both operands) and dl (signed count of
# extra words).  The emitted code
#
#   1. computes r[i] = a[i] - b[i] - borrow for the num common words,
#      advancing a, b and r past them;
#   2. if dl < 0, continues with r[i] = 0 - b[i] - borrow for -dl words;
#   3. if dl > 0, continues with r[i] = a[i] - borrow for dl words, and as
#      soon as one of those subtractions does not borrow switches to a
#      plain copy of the remaining words of a into r.
#
# The final borrow (0 or 1) is returned in eax.
sub bn_sub_part_words
	{
	my ($name)=@_;

	# Register allocation.
	my ($ap,$bp,$rp)=("esi","edi","ebx");
	my $borrow="eax";		# doubles as the return value
	my ($diff,$scratch)=("ecx","edx");
	my $left="ebp";			# words still to process

	# With the minuend already loaded into $diff, emit
	# $diff -= borrow + b[$disp/4] and leave the new borrow as 0/1 in
	# $borrow.  The borrow lives in a register, not in CF: the CF of the
	# first sub is captured by "adc borrow,borrow" (borrow was just zeroed
	# by a flag-preserving mov) and the CF of the second sub is folded in
	# by "adc borrow,0".
	my $minus_b=sub {
		my ($disp)=@_;

		&mov($scratch,&DWP($disp,$bp,"",0));	# *b
		&sub($diff,$borrow);
		&mov($borrow,0);
		&adc($borrow,$borrow);
		&sub($diff,$scratch);
		&adc($borrow,0);
	};

	&function_begin($name,"");

	&comment("");
	&mov($rp,&wparam(0));		# get r
	&mov($ap,&wparam(1));		# get a
	&mov($bp,&wparam(2));		# get b
	&mov($left,&wparam(3));		# get num
	&xor($borrow,$borrow);		# clear borrow
	&and($left,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("aw_finish"));

	#
	# Common part, eight words per iteration.
	#
	&set_label("aw_loop",0);
	foreach my $word (0 .. 7) {
		my $disp=4*$word;

		&comment("Round $word");
		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		$minus_b->($disp);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("aw_loop"));

	#
	# Common part, remaining num % 8 words.  Unlike the other tails in
	# this file the pointers are bumped after every word (always using
	# offset 0) so that a, b and r end up just past the common part for
	# the dl handling below.
	#
	&set_label("aw_finish",0);
	&mov($left,&wparam(3));		# get num
	&and($left,7);
	&jz(&label("aw_end"));

	foreach my $word (0 .. 6) {
		&comment("Tail Round $word");
		&mov($diff,&DWP(0,$ap,"",0));	# *a
		$minus_b->(0);
		&mov(&DWP(0,$rp,"",0),$diff);	# *r
		&add($ap, 4);
		&add($bp, 4);
		&add($rp, 4);
		if ($word<6) {
			&dec($left);
			&jz(&label("aw_end"));
		}
	}
	&set_label("aw_end",0);

	#
	# Extra words: nothing to do for dl == 0, otherwise dispatch on the
	# sign of dl.
	#
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($left,&wparam(4));		# get dl
	&cmp($left,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	#
	# dl < 0: b is longer, so the minuend is zero for -dl more words.
	#
	&comment("pw_neg");
	&mov($scratch,0);
	&sub($scratch,$left);
	&mov($left,$scratch);		# left = -dl
	&and($left,0xfffffff8);		# rounded down to a multiple of 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	foreach my $word (0 .. 7) {
		my $disp=4*$word;

		&comment("dl<0 Round $word");
		&mov($diff,0);
		$minus_b->($disp);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
	}

	&comment("");
	&add($bp,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($scratch,&wparam(4));	# get dl
	&mov($left,0);
	&sub($left,$scratch);		# left = -dl
	&and($left,7);
	&jz(&label("pw_end"));

	foreach my $word (0 .. 6) {
		my $disp=4*$word;

		&comment("dl<0 Tail Round $word");
		&mov($diff,0);
		$minus_b->($disp);
		&dec($left) if ($word<6);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r; mov keeps the flags
		&jz(&label("pw_end")) if ($word<6);
	}

	&jmp(&label("pw_end"));

	#
	# dl > 0: a is longer, so only the borrow is subtracted.  A word that
	# does not borrow ends the chain: control jumps into the matching spot
	# of the copy code further down (labels pw_nc<i> / pw_tail_nc<i>).
	#
	&set_label("pw_pos",0);

	&and($left,0xfffffff8);		# dl rounded down to a multiple of 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	foreach my $word (0 .. 7) {
		my $disp=4*$word;

		&comment("dl>0 Round $word");
		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		&sub($diff,$borrow);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
		&jnc(&label("pw_nc".$word));
	}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($left,&wparam(4));		# get dl
	&and($left,7);
	&jz(&label("pw_end"));

	foreach my $word (0 .. 6) {
		my $disp=4*$word;

		&comment("dl>0 Tail Round $word");
		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		&sub($diff,$borrow);
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
		&jnc(&label("pw_tail_nc".$word));
		if ($word<6) {
			&dec($left);
			&jz(&label("pw_end"));
		}
	}
	&mov($borrow,1);		# every extra word borrowed
	&jmp(&label("pw_end"));

	#
	# Borrow-free continuation of the dl > 0 case: copy the rest of a to
	# r.  Each pw_nc<i> label sits right after the copy of word i, because
	# the jumping code has already stored that word.
	#
	&set_label("pw_nc_loop",0);
	foreach my $word (0 .. 7) {
		my $disp=4*$word;

		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
		&set_label("pw_nc".$word,0);
	}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($left,8);
	&jnz(&label("pw_nc_loop"));

	&mov($left,&wparam(4));		# get dl
	&and($left,7);
	&jz(&label("pw_nc_end"));

	foreach my $word (0 .. 6) {
		my $disp=4*$word;

		&mov($diff,&DWP($disp,$ap,"",0));	# *a
		&mov(&DWP($disp,$rp,"",0),$diff);	# *r
		&set_label("pw_tail_nc".$word,0);
		if ($word<6) {
			&dec($left);
			&jz(&label("pw_nc_end"));
		}
	}

	&set_label("pw_nc_end",0);
	&mov($borrow,0);		# the chain ended without a borrow

	&set_label("pw_end",0);

	# The borrow already sits in eax, the return register.
	&function_end($name);
	}
785