OLD | NEW |
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 # | 2 # |
3 # ==================================================================== | 3 # ==================================================================== |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 # project. Rights for redistribution and usage in source and binary | 5 # project. Rights for redistribution and usage in source and binary |
6 # forms are granted according to the OpenSSL license. | 6 # forms are granted according to the OpenSSL license. |
7 # ==================================================================== | 7 # ==================================================================== |
8 # | 8 # |
9 # sha256/512_block procedure for x86_64. | 9 # sha256/512_block procedure for x86_64. |
10 # | 10 # |
(...skipping 22 matching lines...) Expand all Loading... |
33 # actually possible to noticeably improve overall ILP, instruction | 33 # actually possible to noticeably improve overall ILP, instruction |
34 # level parallelism, on a given CPU implementation in this case. | 34 # level parallelism, on a given CPU implementation in this case. |
35 # | 35 # |
36 # Special note on Intel EM64T. While Opteron CPU exhibits perfect | 36 # Special note on Intel EM64T. While Opteron CPU exhibits perfect |
37 # performance ratio of 1.5 between 64- and 32-bit flavors [see above], | 37 # performance ratio of 1.5 between 64- and 32-bit flavors [see above], |
38 # [currently available] EM64T CPUs apparently are far from it. On the | 38 # [currently available] EM64T CPUs apparently are far from it. On the |
39 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit | 39 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit |
40 # sha256_block:-( This is presumably because 64-bit shifts/rotates | 40 # sha256_block:-( This is presumably because 64-bit shifts/rotates |
41 # apparently are not atomic instructions, but implemented in microcode. | 41 # apparently are not atomic instructions, but implemented in microcode. |
42 | 42 |
43 $output=shift; | 43 $flavour = shift; |
| 44 $output = shift; |
| 45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 46 |
| 47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
44 | 48 |
45 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
46 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | 50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
47 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
48 die "can't locate x86_64-xlate.pl"; | 52 die "can't locate x86_64-xlate.pl"; |
49 | 53 |
50 open STDOUT,"| $^X $xlate $output"; | 54 open STDOUT,"| $^X $xlate $flavour $output"; |
51 | 55 |
52 if ($output =~ /512/) { | 56 if ($output =~ /512/) { |
53 $func="sha512_block_data_order"; | 57 $func="sha512_block_data_order"; |
54 $TABLE="K512"; | 58 $TABLE="K512"; |
55 $SZ=8; | 59 $SZ=8; |
56 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", | 60 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", |
57 "%r8", "%r9", "%r10","%r11"); | 61 "%r8", "%r9", "%r10","%r11"); |
58 ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15"); | 62 ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15"); |
59 @Sigma0=(28,34,39); | 63 @Sigma0=(28,34,39); |
60 @Sigma1=(14,18,41); | 64 @Sigma1=(14,18,41); |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
179 .globl $func | 183 .globl $func |
180 .type $func,\@function,4 | 184 .type $func,\@function,4 |
181 .align 16 | 185 .align 16 |
182 $func: | 186 $func: |
183 push %rbx | 187 push %rbx |
184 push %rbp | 188 push %rbp |
185 push %r12 | 189 push %r12 |
186 push %r13 | 190 push %r13 |
187 push %r14 | 191 push %r14 |
188 push %r15 | 192 push %r15 |
189 » mov» %rsp,%rbp» » # copy %rsp | 193 » mov» %rsp,%r11» » # copy %rsp |
190 shl \$4,%rdx # num*16 | 194 shl \$4,%rdx # num*16 |
191 sub \$$framesz,%rsp | 195 sub \$$framesz,%rsp |
192 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | 196 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ |
193 and \$-64,%rsp # align stack frame | 197 and \$-64,%rsp # align stack frame |
194 mov $ctx,$_ctx # save ctx, 1st arg | 198 mov $ctx,$_ctx # save ctx, 1st arg |
195 mov $inp,$_inp # save inp, 2nd arg | 199 mov $inp,$_inp # save inp, 2nd arg |
196 mov %rdx,$_end # save end pointer, "3rd" arg | 200 mov %rdx,$_end # save end pointer, "3rd" arg |
197 » mov» %rbp,$_rsp» » # save copy of %rsp | 201 » mov» %r11,$_rsp» » # save copy of %rsp |
| 202 .Lprologue: |
198 | 203 |
199 » .picmeup $Tbl | 204 » lea» $TABLE(%rip),$Tbl |
200 » lea» $TABLE-.($Tbl),$Tbl | |
201 | 205 |
202 mov $SZ*0($ctx),$A | 206 mov $SZ*0($ctx),$A |
203 mov $SZ*1($ctx),$B | 207 mov $SZ*1($ctx),$B |
204 mov $SZ*2($ctx),$C | 208 mov $SZ*2($ctx),$C |
205 mov $SZ*3($ctx),$D | 209 mov $SZ*3($ctx),$D |
206 mov $SZ*4($ctx),$E | 210 mov $SZ*4($ctx),$E |
207 mov $SZ*5($ctx),$F | 211 mov $SZ*5($ctx),$F |
208 mov $SZ*6($ctx),$G | 212 mov $SZ*6($ctx),$G |
209 mov $SZ*7($ctx),$H | 213 mov $SZ*7($ctx),$H |
210 jmp .Lloop | 214 jmp .Lloop |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
250 mov $A,$SZ*0($ctx) | 254 mov $A,$SZ*0($ctx) |
251 mov $B,$SZ*1($ctx) | 255 mov $B,$SZ*1($ctx) |
252 mov $C,$SZ*2($ctx) | 256 mov $C,$SZ*2($ctx) |
253 mov $D,$SZ*3($ctx) | 257 mov $D,$SZ*3($ctx) |
254 mov $E,$SZ*4($ctx) | 258 mov $E,$SZ*4($ctx) |
255 mov $F,$SZ*5($ctx) | 259 mov $F,$SZ*5($ctx) |
256 mov $G,$SZ*6($ctx) | 260 mov $G,$SZ*6($ctx) |
257 mov $H,$SZ*7($ctx) | 261 mov $H,$SZ*7($ctx) |
258 jb .Lloop | 262 jb .Lloop |
259 | 263 |
260 » mov» $_rsp,%rsp | 264 » mov» $_rsp,%rsi |
261 » pop» %r15 | 265 » mov» (%rsi),%r15 |
262 » pop» %r14 | 266 » mov» 8(%rsi),%r14 |
263 » pop» %r13 | 267 » mov» 16(%rsi),%r13 |
264 » pop» %r12 | 268 » mov» 24(%rsi),%r12 |
265 » pop» %rbp | 269 » mov» 32(%rsi),%rbp |
266 » pop» %rbx | 270 » mov» 40(%rsi),%rbx |
267 | 271 » lea» 48(%rsi),%rsp |
| 272 .Lepilogue: |
268 ret | 273 ret |
269 .size $func,.-$func | 274 .size $func,.-$func |
270 ___ | 275 ___ |
271 | 276 |
272 if ($SZ==4) { | 277 if ($SZ==4) { |
273 $code.=<<___; | 278 $code.=<<___; |
274 .align 64 | 279 .align 64 |
275 .type $TABLE,\@object | 280 .type $TABLE,\@object |
276 $TABLE: | 281 $TABLE: |
277 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 282 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
332 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | 337 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 |
333 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | 338 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 |
334 .quad 0x113f9804bef90dae,0x1b710b35131c471b | 339 .quad 0x113f9804bef90dae,0x1b710b35131c471b |
335 .quad 0x28db77f523047d84,0x32caab7b40c72493 | 340 .quad 0x28db77f523047d84,0x32caab7b40c72493 |
336 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | 341 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c |
337 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | 342 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a |
338 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | 343 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 |
339 ___ | 344 ___ |
340 } | 345 } |
341 | 346 |
| 347 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| 348 # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| 349 if ($win64) { |
| 350 $rec="%rcx"; |
| 351 $frame="%rdx"; |
| 352 $context="%r8"; |
| 353 $disp="%r9"; |
| 354 |
| 355 $code.=<<___; |
| 356 .extern __imp_RtlVirtualUnwind |
| 357 .type se_handler,\@abi-omnipotent |
| 358 .align 16 |
| 359 se_handler: |
| 360 push %rsi |
| 361 push %rdi |
| 362 push %rbx |
| 363 push %rbp |
| 364 push %r12 |
| 365 push %r13 |
| 366 push %r14 |
| 367 push %r15 |
| 368 pushfq |
| 369 sub \$64,%rsp |
| 370 |
| 371 mov 120($context),%rax # pull context->Rax |
| 372 mov 248($context),%rbx # pull context->Rip |
| 373 |
| 374 lea .Lprologue(%rip),%r10 |
| 375 cmp %r10,%rbx # context->Rip<.Lprologue |
| 376 jb .Lin_prologue |
| 377 |
| 378 mov 152($context),%rax # pull context->Rsp |
| 379 |
| 380 lea .Lepilogue(%rip),%r10 |
| 381 cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 382 jae .Lin_prologue |
| 383 |
| 384 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp |
| 385 lea 48(%rax),%rax |
| 386 |
| 387 mov -8(%rax),%rbx |
| 388 mov -16(%rax),%rbp |
| 389 mov -24(%rax),%r12 |
| 390 mov -32(%rax),%r13 |
| 391 mov -40(%rax),%r14 |
| 392 mov -48(%rax),%r15 |
| 393 mov %rbx,144($context) # restore context->Rbx |
| 394 mov %rbp,160($context) # restore context->Rbp |
| 395 mov %r12,216($context) # restore context->R12 |
| 396 mov %r13,224($context) # restore context->R13 |
| 397 mov %r14,232($context) # restore context->R14 |
| 398 mov %r15,240($context) # restore context->R15 |
| 399 |
| 400 .Lin_prologue: |
| 401 mov 8(%rax),%rdi |
| 402 mov 16(%rax),%rsi |
| 403 mov %rax,152($context) # restore context->Rsp |
| 404 mov %rsi,168($context) # restore context->Rsi |
| 405 mov %rdi,176($context) # restore context->Rdi |
| 406 |
| 407 mov 40($disp),%rdi # disp->ContextRecord |
| 408 mov $context,%rsi # context |
| 409 mov \$154,%ecx # sizeof(CONTEXT)/8, quadword count for rep movsq |
| 410 .long 0xa548f3fc # cld; rep movsq |
| 411 |
| 412 mov $disp,%rsi |
| 413 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| 414 mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| 415 mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| 416 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| 417 mov 40(%rsi),%r10 # disp->ContextRecord |
| 418 lea 56(%rsi),%r11 # &disp->HandlerData |
| 419 lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| 420 mov %r10,32(%rsp) # arg5 |
| 421 mov %r11,40(%rsp) # arg6 |
| 422 mov %r12,48(%rsp) # arg7 |
| 423 mov %rcx,56(%rsp) # arg8, (NULL) |
| 424 call *__imp_RtlVirtualUnwind(%rip) |
| 425 |
| 426 mov \$1,%eax # ExceptionContinueSearch |
| 427 add \$64,%rsp |
| 428 popfq |
| 429 pop %r15 |
| 430 pop %r14 |
| 431 pop %r13 |
| 432 pop %r12 |
| 433 pop %rbp |
| 434 pop %rbx |
| 435 pop %rdi |
| 436 pop %rsi |
| 437 ret |
| 438 .size se_handler,.-se_handler |
| 439 |
| 440 .section .pdata |
| 441 .align 4 |
| 442 .rva .LSEH_begin_$func |
| 443 .rva .LSEH_end_$func |
| 444 .rva .LSEH_info_$func |
| 445 |
| 446 .section .xdata |
| 447 .align 8 |
| 448 .LSEH_info_$func: |
| 449 .byte 9,0,0,0 |
| 450 .rva se_handler |
| 451 ___ |
| 452 } |
| 453 |
342 $code =~ s/\`([^\`]*)\`/eval $1/gem; | 454 $code =~ s/\`([^\`]*)\`/eval $1/gem; |
343 print $code; | 455 print $code; |
344 close STDOUT; | 456 close STDOUT; |
OLD | NEW |