OLD | NEW |
1 #!/usr/bin/env perl | 1 #!/usr/bin/env perl |
2 # | 2 # |
3 # ==================================================================== | 3 # ==================================================================== |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
5 # project. The module is, however, dual licensed under OpenSSL and | 5 # project. The module is, however, dual licensed under OpenSSL and |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
7 # details see http://www.openssl.org/~appro/cryptogams/. | 7 # details see http://www.openssl.org/~appro/cryptogams/. |
8 # ==================================================================== | 8 # ==================================================================== |
9 # | 9 # |
10 # sha1_block procedure for x86_64. | 10 # sha1_block procedure for x86_64. |
(...skipping 11 matching lines...) Expand all Loading... |
22 # reaches for even more registers through dynamic aliasing, and EM64T | 22 # reaches for even more registers through dynamic aliasing, and EM64T |
23 # core must have managed to run-time optimize even 32-bit code just as | 23 # core must have managed to run-time optimize even 32-bit code just as |
24 # good as 64-bit one. Performance improvement is summarized in the | 24 # good as 64-bit one. Performance improvement is summarized in the |
25 # following table: | 25 # following table: |
26 # | 26 # |
27 # gcc 3.4 32-bit asm cycles/byte | 27 # gcc 3.4 32-bit asm cycles/byte |
28 # Opteron +45% +20% 6.8 | 28 # Opteron +45% +20% 6.8 |
29 # Xeon P4 +65% +0% 9.9 | 29 # Xeon P4 +65% +0% 9.9 |
30 # Core2 +60% +10% 7.0 | 30 # Core2 +60% +10% 7.0 |
31 | 31 |
# Command-line handling and output plumbing (NEW column).
# Arguments are "<flavour> <output>" or just "<output>": a first argument that
# contains a dot is treated as the output file and $flavour is cleared.
# $win64 is set for nasm/masm/mingw64 flavours or a ".asm" output name.
# The generated code is piped through x86_64-xlate.pl, looked up next to this
# script or under ../../perlasm/.  The pipe open is checked so that a failure
# to start the translator aborts the run instead of being silently ignored.
32 $output=shift; | 32 $flavour = shift; |
| 33 $output = shift; |
| 34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 35 |
| 36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
33 | 37 |
34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
35 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | 39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
36 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
37 die "can't locate x86_64-xlate.pl"; | 41 die "can't locate x86_64-xlate.pl"; |
38 | 42 |
39 open STDOUT,"| $^X $xlate $output"; | 43 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; |
40 | 44 |
# Register aliases for the three arguments.  The first group records the
# registers named in the "1st/2nd/3rd arg" comments; the second group then
# overwrites the same variables with %r8/%r9/%r10, so any later code that
# interpolates $ctx/$inp/$num sees the reassigned registers.
41 $ctx="%rdi"; # 1st arg | 45 $ctx="%rdi"; # 1st arg |
42 $inp="%rsi"; # 2nd arg | 46 $inp="%rsi"; # 2nd arg |
43 $num="%rdx"; # 3rd arg | 47 $num="%rdx"; # 3rd arg |
44 | 48 |
45 # reassign arguments in order to produce more compact code | 49 # reassign arguments in order to produce more compact code |
46 $ctx="%r8"; | 50 $ctx="%r8"; |
47 $inp="%r9"; | 51 $inp="%r9"; |
48 $num="%r10"; | 52 $num="%r10"; |
49 | 53 |
(...skipping 12 matching lines...) Expand all Loading... |
# PROLOGUE($func): append the entry sequence for $func to $code.
# The emitted text declares $func (.globl/.type/.align and the label), pushes
# %rbx, %rbp and %r12, copies %rsp into a scratch register (%rax in OLD, %r11
# in NEW), reserves 8+16*4 bytes, aligns %rsp down to a multiple of 64 and
# stores the copied stack pointer at 16*4(%rsp).  %rdi/%rsi/%rdx are copied
# into $ctx/$inp/$num, and the values at offsets 0,4,8,12,16 from $ctx are
# loaded into $A..$E.  NEW additionally emits the .Lprologue label right after
# the saved stack pointer has been stored.
# NOTE(review): .Lprologue is a fixed name, not derived from $func, so this
# helper presumably must be expanded only once per output file -- confirm.
62 sub PROLOGUE { | 66 sub PROLOGUE { |
63 my $func=shift; | 67 my $func=shift; |
64 $code.=<<___; | 68 $code.=<<___; |
65 .globl $func | 69 .globl $func |
66 .type $func,\@function,3 | 70 .type $func,\@function,3 |
67 .align 16 | 71 .align 16 |
68 $func: | 72 $func: |
69 push %rbx | 73 push %rbx |
70 push %rbp | 74 push %rbp |
71 push %r12 | 75 push %r12 |
72 » mov» %rsp,%rax | 76 » mov» %rsp,%r11 |
73 mov %rdi,$ctx # reassigned argument | 77 mov %rdi,$ctx # reassigned argument |
74 sub \$`8+16*4`,%rsp | 78 sub \$`8+16*4`,%rsp |
75 mov %rsi,$inp # reassigned argument | 79 mov %rsi,$inp # reassigned argument |
76 and \$-64,%rsp | 80 and \$-64,%rsp |
77 mov %rdx,$num # reassigned argument | 81 mov %rdx,$num # reassigned argument |
78 » mov» %rax,`16*4`(%rsp) | 82 » mov» %r11,`16*4`(%rsp) |
| 83 .Lprologue: |
79 | 84 |
80 mov 0($ctx),$A | 85 mov 0($ctx),$A |
81 mov 4($ctx),$B | 86 mov 4($ctx),$B |
82 mov 8($ctx),$C | 87 mov 8($ctx),$C |
83 mov 12($ctx),$D | 88 mov 12($ctx),$D |
84 mov 16($ctx),$E | 89 mov 16($ctx),$E |
85 ___ | 90 ___ |
86 } | 91 } |
87 | 92 |
# EPILOGUE($func): append the exit sequence for $func to $code.
# OLD reloads %rsp from 16*4(%rsp) and pops %r12, %rbp, %rbx.
# NEW loads the saved stack pointer from 16*4(%rsp) into %rsi, reads %r12,
# %rbp and %rbx from offsets 0, 8 and 16 relative to it, sets %rsp to 24 bytes
# past it, and emits the .Lepilogue label immediately before `ret`.
# $func is used only in the closing .size directive.
88 sub EPILOGUE { | 93 sub EPILOGUE { |
89 my $func=shift; | 94 my $func=shift; |
90 $code.=<<___; | 95 $code.=<<___; |
91 » mov» `16*4`(%rsp),%rsp | 96 » mov» `16*4`(%rsp),%rsi |
92 » pop» %r12 | 97 » mov» (%rsi),%r12 |
93 » pop» %rbp | 98 » mov» 8(%rsi),%rbp |
94 » pop» %rbx | 99 » mov» 16(%rsi),%rbx |
| 100 » lea» 24(%rsi),%rsp |
| 101 .Lepilogue: |
95 ret | 102 ret |
96 .size $func,.-$func | 103 .size $func,.-$func |
97 ___ | 104 ___ |
98 } | 105 } |
99 | 106 |
100 sub BODY_00_19 { | 107 sub BODY_00_19 { |
101 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; | 108 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; |
102 my $j=$i+1; | 109 my $j=$i+1; |
103 $code.=<<___ if ($i==0); | 110 $code.=<<___ if ($i==0); |
104 mov `4*$i`($inp),$xi | 111 mov `4*$i`($inp),$xi |
(...skipping 121 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
226 xchg $E,$C # mov $A,$C | 233 xchg $E,$C # mov $A,$C |
227 xchg $T,$D # mov $B,$D | 234 xchg $T,$D # mov $B,$D |
228 # mov $C,$E | 235 # mov $C,$E |
229 lea `16*4`($inp),$inp | 236 lea `16*4`($inp),$inp |
230 sub \$1,$num | 237 sub \$1,$num |
231 jnz .Lloop | 238 jnz .Lloop |
232 ___ | 239 ___ |
233 &EPILOGUE("sha1_block_data_order"); | 240 &EPILOGUE("sha1_block_data_order"); |
234 $code.=<<___; | 241 $code.=<<___; |
235 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 242 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 243 .align 16 |
236 ___ | 244 ___ |
237 | 245 |
# Win64-only section (NEW column): when $win64 is set, append an exception
# handler named se_handler plus .pdata/.xdata records that point at it for
# sha1_block_data_order.
# As emitted, the handler:
#   - compares context->Rip with .Lprologue and .Lepilogue;
#   - when Rip lies between them, reads the saved stack pointer from 16*4 off
#     context->Rsp, adds 24, and copies the three quadwords just below that
#     address into context->Rbx/Rbp/R12;
#   - otherwise uses context->Rax (Rip before .Lprologue) or context->Rsp
#     (Rip at or after .Lepilogue) as the frame address;
#   - reloads Rdi/Rsi from offsets 8 and 16 of that address, stores the
#     address as context->Rsp, and copies the context (154 quadwords via
#     rep movsq) to disp->ContextRecord;
#   - calls RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
# Only $context and $disp are interpolated into the heredoc; $rec and $frame
# are assigned but not used in the visible text.
| 246 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| 247 # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| 248 if ($win64) { |
| 249 $rec="%rcx"; |
| 250 $frame="%rdx"; |
| 251 $context="%r8"; |
| 252 $disp="%r9"; |
| 253 |
| 254 $code.=<<___; |
| 255 .extern __imp_RtlVirtualUnwind |
| 256 .type se_handler,\@abi-omnipotent |
| 257 .align 16 |
| 258 se_handler: |
| 259 push %rsi |
| 260 push %rdi |
| 261 push %rbx |
| 262 push %rbp |
| 263 push %r12 |
| 264 push %r13 |
| 265 push %r14 |
| 266 push %r15 |
| 267 pushfq |
| 268 sub \$64,%rsp |
| 269 |
| 270 mov 120($context),%rax # pull context->Rax |
| 271 mov 248($context),%rbx # pull context->Rip |
| 272 |
| 273 lea .Lprologue(%rip),%r10 |
| 274 cmp %r10,%rbx # context->Rip<.Lprologue |
| 275 jb .Lin_prologue |
| 276 |
| 277 mov 152($context),%rax # pull context->Rsp |
| 278 |
| 279 lea .Lepilogue(%rip),%r10 |
| 280 cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 281 jae .Lin_prologue |
| 282 |
| 283 mov `16*4`(%rax),%rax # pull saved stack pointer |
| 284 lea 24(%rax),%rax |
| 285 |
| 286 mov -8(%rax),%rbx |
| 287 mov -16(%rax),%rbp |
| 288 mov -24(%rax),%r12 |
| 289 mov %rbx,144($context) # restore context->Rbx |
| 290 mov %rbp,160($context) # restore context->Rbp |
| 291 mov %r12,216($context) # restore context->R12 |
| 292 |
| 293 .Lin_prologue: |
| 294 mov 8(%rax),%rdi |
| 295 mov 16(%rax),%rsi |
| 296 mov %rax,152($context) # restore context->Rsp |
| 297 mov %rsi,168($context) # restore context->Rsi |
| 298 mov %rdi,176($context) # restore context->Rdi |
| 299 |
| 300 mov 40($disp),%rdi # disp->ContextRecord |
| 301 mov $context,%rsi # context |
| 302 mov \$154,%ecx # sizeof(CONTEXT) |
| 303 .long 0xa548f3fc # cld; rep movsq |
| 304 |
| 305 mov $disp,%rsi |
| 306 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| 307 mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| 308 mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| 309 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| 310 mov 40(%rsi),%r10 # disp->ContextRecord |
| 311 lea 56(%rsi),%r11 # &disp->HandlerData |
| 312 lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| 313 mov %r10,32(%rsp) # arg5 |
| 314 mov %r11,40(%rsp) # arg6 |
| 315 mov %r12,48(%rsp) # arg7 |
| 316 mov %rcx,56(%rsp) # arg8, (NULL) |
| 317 call *__imp_RtlVirtualUnwind(%rip) |
| 318 |
| 319 mov \$1,%eax # ExceptionContinueSearch |
| 320 add \$64,%rsp |
| 321 popfq |
| 322 pop %r15 |
| 323 pop %r14 |
| 324 pop %r13 |
| 325 pop %r12 |
| 326 pop %rbp |
| 327 pop %rbx |
| 328 pop %rdi |
| 329 pop %rsi |
| 330 ret |
| 331 .size se_handler,.-se_handler |
| 332 |
| 333 .section .pdata |
| 334 .align 4 |
| 335 .rva .LSEH_begin_sha1_block_data_order |
| 336 .rva .LSEH_end_sha1_block_data_order |
| 337 .rva .LSEH_info_sha1_block_data_order |
| 338 |
| 339 .section .xdata |
| 340 .align 8 |
| 341 .LSEH_info_sha1_block_data_order: |
| 342 .byte 9,0,0,0 |
| 343 .rva se_handler |
| 344 ___ |
| 345 } |
| 346 |
238 #################################################################### | 347 #################################################################### |
239 | 348 |
# Post-processing (NEW column): evaluate every `...` expression embedded in
# $code, write the result to STDOUT (the pipe to the translator) and close it.
# The close is checked: closing a piped handle waits for the child process and
# returns false if it failed, so an error in the translator stops the run
# instead of leaving a possibly truncated output file behind unnoticed.
240 $code =~ s/\`([^\`]*)\`/eval $1/gem; | 349 $code =~ s/\`([^\`]*)\`/eval $1/gem; |
241 print $code; | 350 print $code; |
242 close STDOUT; | 351 close STDOUT or die "error closing STDOUT: $!"; |
OLD | NEW |