#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	the most "favourable" result, for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_)=(@t[0..1]);
my ($xd,$xd_)=$odd?(@t[2],@x[$d1]):(@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a's and 'b's are permanently allocated in registers,
	# @x[0..7], while the 'c's and a pair of 'd's are maintained
	# in memory. If you observe the 'c' column, you'll notice that
	# the pair of 'c's is invariant between rounds. This means
	# that we have to reload them once per round, in the middle.
	# This is why you'll see a bunch of 'c' stores and loads in
	# the middle, but none at the beginning or end. If you observe
	# the 'd' column, you'll notice that 15 and 13 are reused in
	# the next pair of rounds. This is why these two are chosen
	# for offloading to memory, to make the loads count more.
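	# For reference, each column above undergoes the standard
	# ChaCha quarter-round on (a,b,c,d):
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;
	#
	# The ror#16/20/24/25 amounts in the code below are the same
	# rotations expressed as right rotations: 32-16, 32-12, 32-8
	# and 32-7.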
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2] if (!$odd);
	$xd_=@x[$d3] if ($odd);
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.text

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
# if !defined(_WIN32)
	ldr	r4,[r14,r4]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
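	@ Stack frame layout from this point on, in 32-bit words:
	@
	@	sp+4*(0-15)	copy of sigma|key|counter|nonce
	@	sp+4*(16-31)	off-load area for state words
	@	sp+4*(32-34)	saved out, inp and len (r0-r2 at entry)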
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]
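	@ @t[0]-@t[3] now hold state words 12-15, i.e. the block
	@ counter and the nonce; the flags still reflect "cmp @t[3],#64",
	@ so hi below means at least one more block follows and the
	@ stored counter can be advanced for the next iteration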
	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
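# The byte-lane pipeline emitted above is what makes this path
# alignment-agnostic: input is fetched one byte at a time with ldr*b
# (which carries no alignment requirement), xor'ed into the low byte
# as the result word is shifted right by 8 bits per step, and each
# output byte is written with strb.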
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
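# NEON has no vector rotate, so each left-rotation above is synthesized
# as vshr.u32 into a scratch register followed by vsli.32 (shift left
# and insert); the rotation by 16 comes for free as vrev32.16. The
# trailing vext.8 instructions shuffle the 'b', 'c' and 'd' rows into
# diagonal position for the odd rounds and back again. In the main loop
# below, three such NEON "threads" are interleaved with one scalar
# &ROUND thread, which is where the 3xNEON+1xIALU figures in the header
# come from.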
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr	$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp	@t[3],#64*4
	blo	.Ltail_neon

	vld1.8	{$t0-$t1},[r12]!	@ load input
	mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
	vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	str	@t[0],[sp,#4*(12)]	@ save next counter value
	ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	add	@x[4],@x[4],#3		@ counter+3
	ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	ldr	@t[2],[r12,#-8]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	add	@t[3],sp,#4*(32+4)
	str	r12,   [sp,#4*(20+32+1)]	@ save inp
	str	r14,   [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	vldmia	@t[3],{d8-d15}			@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!		@ copy key
	add	sp,sp,#4*(20)			@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop				@ go integer-only

.align	4
.Ltail_neon:
	@ fewer than 64*4 bytes left: xor whole 64-byte blocks
	@ directly, then spill the last, partial block to the stack
	@ and finish byte by byte in .Loop_tail_neon
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!

	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	add	@x[0],sp,#4*(16+8)

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	add	@t[2],sp,#4*(0)
	sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.extern	OPENSSL_armcap_P
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
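# Standalone usage, for reference: pass a flavour understood by
# arm-xlate.pl and an output file name, e.g.
#
#	perl chacha-armv4.pl linux32 chacha-armv4.S
#
# With no flavour (or flavour "void") the generated code is written
# out untranslated.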