| OLD | NEW |
| 1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
| 2 # | 2 # |
| 3 # ==================================================================== | 3 # ==================================================================== |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and | 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== | 8 # ==================================================================== |
| 9 # | 9 # |
| 10 # sha1_block procedure for x86_64. | 10 # sha1_block procedure for x86_64. |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 # reaches for even more registers through dynamic aliasing, and EM64T | 22 # reaches for even more registers through dynamic aliasing, and EM64T |
| 23 # core must have managed to run-time optimize even 32-bit code just as | 23 # core must have managed to run-time optimize even 32-bit code just as |
| 24 # good as 64-bit one. Performance improvement is summarized in the | 24 # good as 64-bit one. Performance improvement is summarized in the |
| 25 # following table: | 25 # following table: |
| 26 # | 26 # |
| 27 # gcc 3.4 32-bit asm cycles/byte | 27 # gcc 3.4 32-bit asm cycles/byte |
| 28 # Opteron +45% +20% 6.8 | 28 # Opteron +45% +20% 6.8 |
| 29 # Xeon P4 +65% +0% 9.9 | 29 # Xeon P4 +65% +0% 9.9 |
| 30 # Core2 +60% +10% 7.0 | 30 # Core2 +60% +10% 7.0 |
| 31 | 31 |
| 32 $output=shift; | 32 $flavour = shift; |
| 33 $output = shift; |
| 34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 35 |
| 36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 33 | 37 |
| 34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | 39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 37 die "can't locate x86_64-xlate.pl"; | 41 die "can't locate x86_64-xlate.pl"; |
| 38 | 42 |
| 39 open STDOUT,"| $^X $xlate $output"; | 43 open STDOUT,"| $^X $xlate $flavour $output"; |
| 40 | 44 |
| 41 $ctx="%rdi"; # 1st arg | 45 $ctx="%rdi"; # 1st arg |
| 42 $inp="%rsi"; # 2nd arg | 46 $inp="%rsi"; # 2nd arg |
| 43 $num="%rdx"; # 3rd arg | 47 $num="%rdx"; # 3rd arg |
| 44 | 48 |
| 45 # reassign arguments in order to produce more compact code | 49 # reassign arguments in order to produce more compact code |
| 46 $ctx="%r8"; | 50 $ctx="%r8"; |
| 47 $inp="%r9"; | 51 $inp="%r9"; |
| 48 $num="%r10"; | 52 $num="%r10"; |
| 49 | 53 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 62 sub PROLOGUE { | 66 sub PROLOGUE { |
| 63 my $func=shift; | 67 my $func=shift; |
| 64 $code.=<<___; | 68 $code.=<<___; |
| 65 .globl $func | 69 .globl $func |
| 66 .type $func,\@function,3 | 70 .type $func,\@function,3 |
| 67 .align 16 | 71 .align 16 |
| 68 $func: | 72 $func: |
| 69 push %rbx | 73 push %rbx |
| 70 push %rbp | 74 push %rbp |
| 71 push %r12 | 75 push %r12 |
| 72 » mov» %rsp,%rax | 76 » mov» %rsp,%r11 |
| 73 mov %rdi,$ctx # reassigned argument | 77 mov %rdi,$ctx # reassigned argument |
| 74 sub \$`8+16*4`,%rsp | 78 sub \$`8+16*4`,%rsp |
| 75 mov %rsi,$inp # reassigned argument | 79 mov %rsi,$inp # reassigned argument |
| 76 and \$-64,%rsp | 80 and \$-64,%rsp |
| 77 mov %rdx,$num # reassigned argument | 81 mov %rdx,$num # reassigned argument |
| 78 » mov» %rax,`16*4`(%rsp) | 82 » mov» %r11,`16*4`(%rsp) |
| 83 .Lprologue: |
| 79 | 84 |
| 80 mov 0($ctx),$A | 85 mov 0($ctx),$A |
| 81 mov 4($ctx),$B | 86 mov 4($ctx),$B |
| 82 mov 8($ctx),$C | 87 mov 8($ctx),$C |
| 83 mov 12($ctx),$D | 88 mov 12($ctx),$D |
| 84 mov 16($ctx),$E | 89 mov 16($ctx),$E |
| 85 ___ | 90 ___ |
| 86 } | 91 } |
| 87 | 92 |
| 88 sub EPILOGUE { | 93 sub EPILOGUE { |
| 89 my $func=shift; | 94 my $func=shift; |
| 90 $code.=<<___; | 95 $code.=<<___; |
| 91 » mov» `16*4`(%rsp),%rsp | 96 » mov» `16*4`(%rsp),%rsi |
| 92 » pop» %r12 | 97 » mov» (%rsi),%r12 |
| 93 » pop» %rbp | 98 » mov» 8(%rsi),%rbp |
| 94 » pop» %rbx | 99 » mov» 16(%rsi),%rbx |
| 100 » lea» 24(%rsi),%rsp |
| 101 .Lepilogue: |
| 95 ret | 102 ret |
| 96 .size $func,.-$func | 103 .size $func,.-$func |
| 97 ___ | 104 ___ |
| 98 } | 105 } |
| 99 | 106 |
| 100 sub BODY_00_19 { | 107 sub BODY_00_19 { |
| 101 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; | 108 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; |
| 102 my $j=$i+1; | 109 my $j=$i+1; |
| 103 $code.=<<___ if ($i==0); | 110 $code.=<<___ if ($i==0); |
| 104 mov `4*$i`($inp),$xi | 111 mov `4*$i`($inp),$xi |
| (...skipping 121 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 226 xchg $E,$C # mov $A,$C | 233 xchg $E,$C # mov $A,$C |
| 227 xchg $T,$D # mov $B,$D | 234 xchg $T,$D # mov $B,$D |
| 228 # mov $C,$E | 235 # mov $C,$E |
| 229 lea `16*4`($inp),$inp | 236 lea `16*4`($inp),$inp |
| 230 sub \$1,$num | 237 sub \$1,$num |
| 231 jnz .Lloop | 238 jnz .Lloop |
| 232 ___ | 239 ___ |
| 233 &EPILOGUE("sha1_block_data_order"); | 240 &EPILOGUE("sha1_block_data_order"); |
| 234 $code.=<<___; | 241 $code.=<<___; |
| 235 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 242 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 243 .align 16 |
| 236 ___ | 244 ___ |
| 237 | 245 |
| 246 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| 247 # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| 248 if ($win64) { |
| 249 $rec="%rcx"; |
| 250 $frame="%rdx"; |
| 251 $context="%r8"; |
| 252 $disp="%r9"; |
| 253 |
| 254 $code.=<<___; |
| 255 .extern __imp_RtlVirtualUnwind |
| 256 .type se_handler,\@abi-omnipotent |
| 257 .align 16 |
| 258 se_handler: |
| 259 push %rsi |
| 260 push %rdi |
| 261 push %rbx |
| 262 push %rbp |
| 263 push %r12 |
| 264 push %r13 |
| 265 push %r14 |
| 266 push %r15 |
| 267 pushfq |
| 268 sub \$64,%rsp |
| 269 |
| 270 mov 120($context),%rax # pull context->Rax |
| 271 mov 248($context),%rbx # pull context->Rip |
| 272 |
| 273 lea .Lprologue(%rip),%r10 |
| 274 cmp %r10,%rbx # context->Rip<.Lprologue |
| 275 jb .Lin_prologue |
| 276 |
| 277 mov 152($context),%rax # pull context->Rsp |
| 278 |
| 279 lea .Lepilogue(%rip),%r10 |
| 280 cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 281 jae .Lin_prologue |
| 282 |
| 283 mov `16*4`(%rax),%rax # pull saved stack pointer |
| 284 lea 24(%rax),%rax |
| 285 |
| 286 mov -8(%rax),%rbx |
| 287 mov -16(%rax),%rbp |
| 288 mov -24(%rax),%r12 |
| 289 mov %rbx,144($context) # restore context->Rbx |
| 290 mov %rbp,160($context) # restore context->Rbp |
| 291 mov %r12,216($context) # restore context->R12 |
| 292 |
| 293 .Lin_prologue: |
| 294 mov 8(%rax),%rdi |
| 295 mov 16(%rax),%rsi |
| 296 mov %rax,152($context) # restore context->Rsp |
| 297 mov %rsi,168($context) # restore context->Rsi |
| 298 mov %rdi,176($context) # restore context->Rdi |
| 299 |
| 300 mov 40($disp),%rdi # disp->ContextRecord |
| 301 mov $context,%rsi # context |
| 302 mov \$154,%ecx # sizeof(CONTEXT) |
| 303 .long 0xa548f3fc # cld; rep movsq |
| 304 |
| 305 mov $disp,%rsi |
| 306 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| 307 mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| 308 mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| 309 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| 310 mov 40(%rsi),%r10 # disp->ContextRecord |
| 311 lea 56(%rsi),%r11 # &disp->HandlerData |
| 312 lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| 313 mov %r10,32(%rsp) # arg5 |
| 314 mov %r11,40(%rsp) # arg6 |
| 315 mov %r12,48(%rsp) # arg7 |
| 316 mov %rcx,56(%rsp) # arg8, (NULL) |
| 317 call *__imp_RtlVirtualUnwind(%rip) |
| 318 |
| 319 mov \$1,%eax # ExceptionContinueSearch |
| 320 add \$64,%rsp |
| 321 popfq |
| 322 pop %r15 |
| 323 pop %r14 |
| 324 pop %r13 |
| 325 pop %r12 |
| 326 pop %rbp |
| 327 pop %rbx |
| 328 pop %rdi |
| 329 pop %rsi |
| 330 ret |
| 331 .size se_handler,.-se_handler |
| 332 |
| 333 .section .pdata |
| 334 .align 4 |
| 335 .rva .LSEH_begin_sha1_block_data_order |
| 336 .rva .LSEH_end_sha1_block_data_order |
| 337 .rva .LSEH_info_sha1_block_data_order |
| 338 |
| 339 .section .xdata |
| 340 .align 8 |
| 341 .LSEH_info_sha1_block_data_order: |
| 342 .byte 9,0,0,0 |
| 343 .rva se_handler |
| 344 ___ |
| 345 } |
| 346 |
| 238 #################################################################### | 347 #################################################################### |
| 239 | 348 |
| 240 $code =~ s/\`([^\`]*)\`/eval $1/gem; | 349 $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| 241 print $code; | 350 print $code; |
| 242 close STDOUT; | 351 close STDOUT; |
| OLD | NEW |