Lines Matching refs:T0

392 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
431 &movdqa ($T0,$D2);
433 &pslld ($T0,2);
435 &paddd ($T0,$D2); # *5
437 &movdqa (&QWP(16*6,"esp"),$T0);
439 &movdqa ($T0,$D4);
441 &pslld ($T0,2);
443 &paddd ($T0,$D4); # *5
445 &movdqa (&QWP(16*8,"esp"),$T0);
448 &movdqa ($T0,$D1);
482 &movdqa ($T1,$T0);
483 &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3
486 &paddq ($D4,$T0);
487 &movdqa ($T0,$T2);
491 &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0
495 &paddq ($D1,$T0);
497 &movdqa ($T0,$T2);
500 &movdqa ($T1,$T0);
501 &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1
505 &paddq ($D3,$T0);
506 &movdqa ($T0,$T2);
509 &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3
515 &paddq ($D0,$T0);
516 &$load ($T0,7); # s3^n
519 &movdqa ($T1,$T0);
520 &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4
524 &paddq ($D2,$T0);
526 &$load ($T0,4); # r4^n
530 &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0
534 &paddq ($D4,$T0);
535 &movdqa ($T0,$T2);
538 &movdqa ($T1,$T0);
539 &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2
543 &paddq ($D1,$T0);
559 &movdqa ($T0,$D3);
561 &psrlq ($T0,26);
563 &paddq ($T0,$D4); # h3 -> h4
567 &movdqa ($D4,$T0);
569 &psrlq ($T0,26);
573 &paddd ($D0,$T0); # favour paddd when
577 &psllq ($T0,2);
579 &paddq ($T0,$D0); # h4 -> h0 (*)
585 &movdqa ($D0,$T0);
586 &psrlq ($T0,26);
590 &paddd ($D1,$T0); # h0 -> h1
631 &movdqa ($T0,$D2);
633 &pslld ($T0,2);
635 &paddd ($T0,$D2); # *5
637 &movdqu (&QWP(16*6,"edi"),$T0);
639 &movdqa ($T0,$D4);
641 &pslld ($T0,2);
643 &paddd ($T0,$D4); # *5
645 &movdqu (&QWP(16*8,"edi"),$T0);
730 &movdqa ($T0,$T1); # -> base 2^26 ...
734 &movdqa ($T1,$T0);
735 &psrlq ($T0,26);
737 &pand ($T0,$MASK);
738 &paddd ($D1,$T0);
740 &movdqa ($T0,$T1);
745 &movdqa ($T1,$T0);
746 &psrlq ($T0,30);
747 &pand ($T0,$MASK);
749 &paddd ($D3,$T0);
751 &movd ($T0,"eax"); # padbit
754 &paddd ($D4,$T0);
772 &movd ($T0,&DWP(16*1+12,"edi")); # r1
793 &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4
794 &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
796 &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2
800 &movdqa (&QWP(16*(0-9),"edx"),$T0);
801 &pshufd ($T0,$T1,0b01000100);
803 &movdqa (&QWP(16*1,"edx"),$T0);
804 &movdqu ($T0,&QWP(16*2,"edi"));
806 &pshufd ($T1,$T0,0b01000100);
807 &pshufd ($T0,$T0,0b11101110);
810 &movdqa (&QWP(16*(2-9),"edx"),$T0);
811 &pshufd ($T0,$T1,0b01000100);
813 &movdqa (&QWP(16*3,"edx"),$T0);
814 &movdqu ($T0,&QWP(16*4,"edi"));
816 &pshufd ($T1,$T0,0b01000100);
817 &pshufd ($T0,$T0,0b11101110);
820 &movdqa (&QWP(16*(4-9),"edx"),$T0);
821 &pshufd ($T0,$T1,0b01000100);
823 &movdqa (&QWP(16*5,"edx"),$T0);
824 &movdqu ($T0,&QWP(16*6,"edi"));
826 &pshufd ($T1,$T0,0b01000100);
827 &pshufd ($T0,$T0,0b11101110);
830 &movdqa (&QWP(16*(6-9),"edx"),$T0);
831 &pshufd ($T0,$T1,0b01000100);
833 &movdqa (&QWP(16*7,"edx"),$T0);
834 &movdqu ($T0,&QWP(16*8,"edi"));
836 &pshufd ($T1,$T0,0b01000100);
837 &pshufd ($T0,$T0,0b11101110);
839 &movdqa (&QWP(16*(8-9),"edx"),$T0);
844 &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input
852 &movdqa ($D2,$T0); # splat input
856 &movdqa ($D4,$T0);
859 &punpcklqdq ($T0,$T1); # 0:1
864 &movdqa ($T1,$T0);
867 &pand ($T0,$MASK); # 0
904 &movdqa ($D1,$T0);
905 &pmuludq ($T0,$T2); # h0*r0
918 &paddq ($D0,$T0);
919 &movdqa ($T0,$T2);
922 &movdqa ($T1,$T0);
923 &pmuludq ($T0,&$addr(3)); # h0*r3
927 &paddq ($D3,$T0);
929 &movdqa ($T0,$T2);
932 &movdqa ($T1,$T0);
933 &pmuludq ($T0,&$addr(2)); # h1*r2
937 &paddq ($D3,$T0);
938 &movdqa ($T0,$T2);
941 &movdqa ($T1,$T0);
942 &pmuludq ($T0,&$addr(8)); # h2*s4
947 &paddq ($D1,$T0);
948 &movdqa ($T0,&QWP(16*3,"eax")); # pull h3
951 &movdqa ($T1,$T0);
952 &pmuludq ($T0,&$addr(6)); # h3*s2
956 &paddq ($D0,$T0);
957 &movdqa ($T0,$T2);
962 &pmuludq ($T0,&$addr(1)); # h3*r1
966 &paddq ($D4,$T0);
967 &movdqa ($T0,$T2);
970 &movdqa ($T1,$T0);
971 &pmuludq ($T0,&$addr(6)); # h4*s2
975 &paddq ($D1,$T0);
984 &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value
1007 &movdqa ($D1,$T0);
1008 &pmuludq ($T0,$T2); # h0*r0
1009 &paddq ($T0,$D0);
1037 &paddd ($T0,$D0); # add hash value
1045 &movdqa (&QWP(16*0,"eax"),$T0);
1058 &pmuludq ($T0,$T2); # h0*r0
1061 &movdqa ($D0,$T0);
1062 &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
1076 &paddd ($T0,&QWP(16*5,"esp")); # add hash value
1085 &movdqa (&QWP(16*0,"esp"),$T0);
1086 &pmuludq ($T0,$T2); # h0*r0
1089 &paddq ($D0,$T0);
1090 &movdqa ($T0,$D2);
1096 &movdqa (&QWP(16*2,"esp"),$T0);
1097 &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n
1115 &pshufd ($T0,$D3,0b01001110);
1117 &paddq ($D3,$T0);
1119 &pshufd ($T0,$D1,0b01001110);
1121 &paddq ($D1,$T0);
1267 &vpslld ($T0,$D2,2);
1269 &vpaddd ($T0,$T0,$D2); # *5
1271 &vmovdqa (&QWP(16*6,"esp"),$T0);
1273 &vpslld ($T0,$D4,2);
1275 &vpaddd ($T0,$T0,$D4); # *5
1277 &vmovdqa (&QWP(16*8,"esp"),$T0);
1279 &vpshufd ($T0,$D0,0b01000100);
1285 &vmovdqa (&QWP(16*0,"edx"),$T0);
1302 &vpmuludq ($D0,$T0,$D0); # h0*r0
1304 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3
1305 &vpaddq ($D4,$D4,$T0);
1308 &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1
1309 &vpaddq ($D2,$D2,$T0);
1313 &vmovdqa ($T0,&QWP(16*2,"esp")); # r2
1317 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2
1319 &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1
1322 &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0
1323 &vpaddq ($D2,$D2,$T0);
1326 &vmovdqa ($T0,&QWP(16*3,"esp")); # r3
1330 &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1
1333 &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r3*h0
1334 &vpaddq ($D3,$D3,$T0);
1337 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3
1338 &vpaddq ($D1,$D1,$T0);
1343 &vmovdqa ($T0,&QWP(16*8,"esp")); # s4
1346 &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4
1348 &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1
1350 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2
1353 &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3
1354 &vpaddq ($D2,$D2,$T0);
1358 &vpsrlq ($T0,$D3,26);
1362 &vpaddq ($D4,$D4,$T0); # h3 -> h4
1364 &vpsrlq ($T0,$D4,26);
1369 &vpaddd ($D0,$D0,$T0);
1370 &vpsllq ($T0,$T0,2);
1373 &vpaddd ($D0,$D0,$T0); # h4 -> h0
1376 &vpsrlq ($T0,$D0,26);
1379 &vpaddd ($D1,$D1,$T0); # h0 -> h1
1417 &vpslld ($T0,$D2,2);
1419 &vpaddd ($T0,$T0,$D2); # *5
1421 &vmovdqu (&QWP(16*6,"edi"),$T0);
1423 &vpslld ($T0,$D4,2);
1425 &vpaddd ($T0,$T0,$D4); # *5
1427 &vmovdqu (&QWP(16*8,"edi"),$T0);
1437 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
1561 &vmovdqu (&X($T0),&QWP(16*0,"esi"));
1568 &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
1591 &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
1593 &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
1610 &vpsrldq ($D2,$T0,6); # splat input
1614 &vpunpckhqdq ($D1,$T0,$T1); # 4
1615 &vpunpcklqdq ($T0,$T0,$T1); # 0:1
1620 &vpsrlq ($T1,$T0,26);
1623 &vpand ($T0,$T0,$MASK); # 0
1637 &vpaddq ($T0,$T0,&QWP(32*0,"esp"));
1658 &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3
1660 &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4
1662 &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0
1665 &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1
1667 &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2
1668 &vpaddq ($D2,$D2,$T0); # d2 += h0*r2
1672 &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3
1673 &vpaddq ($D4,$D4,$T0); # d4 += h1*r3
1677 &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0
1678 &vpaddq ($D1,$D1,$T0); # d1 += h1*r0
1682 &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0
1683 &vpaddq ($D3,$D3,$T0); # d3 += h3*r0
1686 &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2
1687 &vpaddq ($D0,$D0,$T0); # d0 += h3*s2
1688 &vmovdqa ($T0,&QWP(32*4,"esp")); # h4
1694 &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4
1696 &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1
1698 &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0
1701 &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2
1703 &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3
1704 &vpaddq ($D2,$D2,$T0); # d2 += h4*s3
1712 &vpsrlq ($T0,$D3,26);
1716 &vpaddq ($D4,$D4,$T0); # h3 -> h4
1718 &vpsrlq ($T0,$D4,26);
1723 &vpaddq ($D0,$D0,$T0);
1724 &vpsllq ($T0,$T0,2);
1727 &vpaddq ($D0,$D0,$T0); # h4 -> h0
1730 &vpsrlq ($T0,$D0,26);
1733 &vpaddq ($D1,$D1,$T0); # h0 -> h1
1738 &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
1740 &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
1755 &vpsrldq ($T0,$D4,8);
1757 &vpaddq ($D4,$D4,$T0);
1758 &vpsrldq ($T0,$D0,8);
1761 &vpaddq ($D0,$D0,$T0);
1762 &vpsrldq ($T0,$D2,8);
1765 &vpaddq ($D2,$D2,$T0);
1766 &vpermq ($T0,$D3,2);
1769 &vpaddq ($D3,$D3,$T0);
1770 &vpermq ($T0,$D1,2);
1773 &vpaddq ($D1,$D1,$T0);