OLD | NEW |
1 #!/usr/bin/perl -w | 1 #!/usr/bin/perl -w |
2 # | 2 # |
3 # MD5 optimized for AMD64. | 3 # MD5 optimized for AMD64. |
4 # | 4 # |
5 # Author: Marc Bevand <bevand_m (at) epita.fr> | 5 # Author: Marc Bevand <bevand_m (at) epita.fr> |
6 # Licence: I hereby disclaim the copyright on this code and place it | 6 # Licence: I hereby disclaim the copyright on this code and place it |
7 # in the public domain. | 7 # in the public domain. |
8 # | 8 # |
9 | 9 |
10 use strict; | 10 use strict; |
11 | 11 |
12 my $code; | 12 my $code; |
13 | 13 |
14 # round1_step() does: | 14 # round1_step() does: |
15 # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) | 15 # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) |
16 # %r10d = X[k_next] | 16 # %r10d = X[k_next] |
17 # %r11d = z' (copy of z for the next step) | 17 # %r11d = z' (copy of z for the next step) |
18 # Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC) | 18 # Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC) |
19 sub round1_step | 19 sub round1_step |
20 { | 20 { |
21 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 21 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
22 $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal | |
23 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1); | 22 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1); |
24 $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); | 23 $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); |
25 $code .= <<EOF; | 24 $code .= <<EOF; |
26 xor $y, %r11d /* y ^ ... */ | 25 xor $y, %r11d /* y ^ ... */ |
27 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ | 26 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ |
28 and $x, %r11d /* x & ... */ | 27 and $x, %r11d /* x & ... */ |
29 xor $z, %r11d /* z ^ ... */ | 28 xor $z, %r11d /* z ^ ... */ |
30 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ | 29 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ |
31 add %r11d, $dst /* dst += ... */ | 30 add %r11d, $dst /* dst += ... */ |
32 rol \$$s, $dst /* dst <<< s */ | 31 rol \$$s, $dst /* dst <<< s */ |
33 mov $y, %r11d /* (NEXT STEP) z' = $y */ | 32 mov $y, %r11d /* (NEXT STEP) z' = $y */ |
34 add $x, $dst /* dst += x */ | 33 add $x, $dst /* dst += x */ |
35 EOF | 34 EOF |
36 } | 35 } |
37 | 36 |
38 # round2_step() does: | 37 # round2_step() does: |
39 # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) | 38 # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) |
40 # %r10d = X[k_next] | 39 # %r10d = X[k_next] |
41 # %r11d = y' (copy of y for the next step) | 40 # %r11d = z' (copy of z for the next step) |
42 # Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC) | 41 # %r12d = z' (copy of z for the next step) |
| 42 # Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC) |
43 sub round2_step | 43 sub round2_step |
44 { | 44 { |
45 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 45 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
46 $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal | |
47 $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1); | 46 $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1); |
48 $code .= " mov» %ecx,» » %r11d» » /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1); | 47 $code .= " mov» %edx,» » %r11d» » /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); |
| 48 $code .= " mov» %edx,» » %r12d» » /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); |
49 $code .= <<EOF; | 49 $code .= <<EOF; |
50 » xor» $x,» » %r11d» » /* x ^ ... */ | 50 » not» %r11d» » » » /* not z */ |
51 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ | 51 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ |
52 » and» $z,» » %r11d» » /* z & ... */ | 52 » and» $x,» » %r12d» » /* x & z */ |
53 » xor» $y,» » %r11d» » /* y ^ ... */ | 53 » and» $y,» » %r11d» » /* y & (not z) */ |
54 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ | 54 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ |
55 » add» %r11d,» » $dst» » /* dst += ... */ | 55 » or» %r11d,» » %r12d» » /* (y & (not z)) | (x & z) */ |
| 56 » mov» $y,» » %r11d» » /* (NEXT STEP) z' = $y */ |
| 57 » add» %r12d,» » $dst» » /* dst += ... */ |
| 58 » mov» $y,» » %r12d» » /* (NEXT STEP) z' = $y */ |
56 rol \$$s, $dst /* dst <<< s */ | 59 rol \$$s, $dst /* dst <<< s */ |
57 mov $x, %r11d /* (NEXT STEP) y' = $x */ | |
58 add $x, $dst /* dst += x */ | 60 add $x, $dst /* dst += x */ |
59 EOF | 61 EOF |
60 } | 62 } |
61 | 63 |
62 # round3_step() does: | 64 # round3_step() does: |
63 # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) | 65 # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) |
64 # %r10d = X[k_next] | 66 # %r10d = X[k_next] |
65 # %r11d = y' (copy of y for the next step) | 67 # %r11d = y' (copy of y for the next step) |
66 # Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC) | 68 # Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC) |
67 sub round3_step | 69 sub round3_step |
68 { | 70 { |
69 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 71 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
70 $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal | |
71 $code .= " mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($pos == -1); | 72 $code .= " mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($pos == -1); |
72 $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1); | 73 $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1); |
73 $code .= <<EOF; | 74 $code .= <<EOF; |
74 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ | 75 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ |
75 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ | 76 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ |
76 xor $z, %r11d /* z ^ ... */ | 77 xor $z, %r11d /* z ^ ... */ |
77 xor $x, %r11d /* x ^ ... */ | 78 xor $x, %r11d /* x ^ ... */ |
78 add %r11d, $dst /* dst += ... */ | 79 add %r11d, $dst /* dst += ... */ |
79 rol \$$s, $dst /* dst <<< s */ | 80 rol \$$s, $dst /* dst <<< s */ |
80 mov $x, %r11d /* (NEXT STEP) y' = $x */ | 81 mov $x, %r11d /* (NEXT STEP) y' = $x */ |
81 add $x, $dst /* dst += x */ | 82 add $x, $dst /* dst += x */ |
82 EOF | 83 EOF |
83 } | 84 } |
84 | 85 |
85 # round4_step() does: | 86 # round4_step() does: |
86 # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) | 87 # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) |
87 # %r10d = X[k_next] | 88 # %r10d = X[k_next] |
88 # %r11d = not z' (copy of not z for the next step) | 89 # %r11d = not z' (copy of not z for the next step) |
89 # Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC) | 90 # Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC) |
90 sub round4_step | 91 sub round4_step |
91 { | 92 { |
92 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 93 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
93 $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal | |
94 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1); | 94 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1); |
95 $code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1); | 95 $code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1); |
96 $code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n" | 96 $code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n" |
97 if ($pos == -1); | 97 if ($pos == -1); |
98 $code .= <<EOF; | 98 $code .= <<EOF; |
99 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ | 99 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ |
100 or $x, %r11d /* x | ... */ | 100 or $x, %r11d /* x | ... */ |
101 xor $y, %r11d /* y ^ ... */ | 101 xor $y, %r11d /* y ^ ... */ |
102 add %r11d, $dst /* dst += ... */ | 102 add %r11d, $dst /* dst += ... */ |
103 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ | 103 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ |
104 mov \$0xffffffff, %r11d | 104 mov \$0xffffffff, %r11d |
105 rol \$$s, $dst /* dst <<< s */ | 105 rol \$$s, $dst /* dst <<< s */ |
106 xor $y, %r11d /* (NEXT STEP) not z' = not $y */ | 106 xor $y, %r11d /* (NEXT STEP) not z' = not $y */ |
107 add $x, $dst /* dst += x */ | 107 add $x, $dst /* dst += x */ |
108 EOF | 108 EOF |
109 } | 109 } |
110 | 110 |
111 my $output = shift; | 111 my $flavour = shift; |
112 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; | 112 my $output = shift; |
| 113 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 114 |
| 115 my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 116 |
| 117 $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; |
| 118 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 119 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 120 die "can't locate x86_64-xlate.pl"; |
| 121 |
| 122 no warnings qw(uninitialized); |
| 123 open STDOUT,"| $^X $xlate $flavour $output"; |
113 | 124 |
114 $code .= <<EOF; | 125 $code .= <<EOF; |
115 .text | 126 .text |
116 .align 16 | 127 .align 16 |
117 | 128 |
118 .globl md5_block_asm_data_order | 129 .globl md5_block_asm_data_order |
119 .type md5_block_asm_data_order,\@function,3 | 130 .type md5_block_asm_data_order,\@function,3 |
120 md5_block_asm_data_order: | 131 md5_block_asm_data_order: |
121 push %rbp | 132 push %rbp |
122 push %rbx | 133 push %rbx |
| 134 push %r12 |
123 push %r14 | 135 push %r14 |
124 push %r15 | 136 push %r15 |
| 137 .Lprologue: |
125 | 138 |
126 # rdi = arg #1 (ctx, MD5_CTX pointer) | 139 # rdi = arg #1 (ctx, MD5_CTX pointer) |
127 # rsi = arg #2 (ptr, data pointer) | 140 # rsi = arg #2 (ptr, data pointer) |
128 # rdx = arg #3 (nbr, number of 16-word blocks to process) | 141 # rdx = arg #3 (nbr, number of 16-word blocks to process) |
129 mov %rdi, %rbp # rbp = ctx | 142 mov %rdi, %rbp # rbp = ctx |
130 shl \$6, %rdx # rdx = nbr in bytes | 143 shl \$6, %rdx # rdx = nbr in bytes |
131 lea (%rsi,%rdx), %rdi # rdi = end | 144 lea (%rsi,%rdx), %rdi # rdi = end |
132 mov 0*4(%rbp), %eax # eax = ctx->A | 145 mov 0*4(%rbp), %eax # eax = ctx->A |
133 mov 1*4(%rbp), %ebx # ebx = ctx->B | 146 mov 1*4(%rbp), %ebx # ebx = ctx->B |
134 mov 2*4(%rbp), %ecx # ecx = ctx->C | 147 mov 2*4(%rbp), %ecx # ecx = ctx->C |
(... 94 matching lines skipped; identical in OLD and NEW ...) |
229 cmp %rdi, %rsi # cmp end with ptr | 242 cmp %rdi, %rsi # cmp end with ptr |
230 jb .Lloop # jmp if ptr < end | 243 jb .Lloop # jmp if ptr < end |
231 # END of loop over 16-word blocks | 244 # END of loop over 16-word blocks |
232 | 245 |
233 .Lend: | 246 .Lend: |
234 mov %eax, 0*4(%rbp) # ctx->A = A | 247 mov %eax, 0*4(%rbp) # ctx->A = A |
235 mov %ebx, 1*4(%rbp) # ctx->B = B | 248 mov %ebx, 1*4(%rbp) # ctx->B = B |
236 mov %ecx, 2*4(%rbp) # ctx->C = C | 249 mov %ecx, 2*4(%rbp) # ctx->C = C |
237 mov %edx, 3*4(%rbp) # ctx->D = D | 250 mov %edx, 3*4(%rbp) # ctx->D = D |
238 | 251 |
239 » pop» %r15 | 252 » mov» (%rsp),%r15 |
240 » pop» %r14 | 253 » mov» 8(%rsp),%r14 |
241 » pop» %rbx | 254 » mov» 16(%rsp),%r12 |
242 » pop» %rbp | 255 » mov» 24(%rsp),%rbx |
| 256 » mov» 32(%rsp),%rbp |
| 257 » add» \$40,%rsp |
| 258 .Lepilogue: |
243 ret | 259 ret |
244 .size md5_block_asm_data_order,.-md5_block_asm_data_order | 260 .size md5_block_asm_data_order,.-md5_block_asm_data_order |
245 EOF | 261 EOF |
246 | 262 |
| 263 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| 264 # CONTEXT *context,DISPATCHER_CONTEXT *disp) |
| 265 if ($win64) { |
| 266 my $rec="%rcx"; |
| 267 my $frame="%rdx"; |
| 268 my $context="%r8"; |
| 269 my $disp="%r9"; |
| 270 |
| 271 $code.=<<___; |
| 272 .extern __imp_RtlVirtualUnwind |
| 273 .type se_handler,\@abi-omnipotent |
| 274 .align 16 |
| 275 se_handler: |
| 276 push %rsi |
| 277 push %rdi |
| 278 push %rbx |
| 279 push %rbp |
| 280 push %r12 |
| 281 push %r13 |
| 282 push %r14 |
| 283 push %r15 |
| 284 pushfq |
| 285 sub \$64,%rsp |
| 286 |
| 287 mov 120($context),%rax # pull context->Rax |
| 288 mov 248($context),%rbx # pull context->Rip |
| 289 |
| 290 lea .Lprologue(%rip),%r10 |
| 291 cmp %r10,%rbx # context->Rip<.Lprologue |
| 292 jb .Lin_prologue |
| 293 |
| 294 mov 152($context),%rax # pull context->Rsp |
| 295 |
| 296 lea .Lepilogue(%rip),%r10 |
| 297 cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 298 jae .Lin_prologue |
| 299 |
| 300 lea 40(%rax),%rax |
| 301 |
| 302 mov -8(%rax),%rbp |
| 303 mov -16(%rax),%rbx |
| 304 mov -24(%rax),%r12 |
| 305 mov -32(%rax),%r14 |
| 306 mov -40(%rax),%r15 |
| 307 mov %rbx,144($context) # restore context->Rbx |
| 308 mov %rbp,160($context) # restore context->Rbp |
| 309 mov %r12,216($context) # restore context->R12 |
| 310 mov %r14,232($context) # restore context->R14 |
| 311 mov %r15,240($context) # restore context->R15 |
| 312 |
| 313 .Lin_prologue: |
| 314 mov 8(%rax),%rdi |
| 315 mov 16(%rax),%rsi |
| 316 mov %rax,152($context) # restore context->Rsp |
| 317 mov %rsi,168($context) # restore context->Rsi |
| 318 mov %rdi,176($context) # restore context->Rdi |
| 319 |
| 320 mov 40($disp),%rdi # disp->ContextRecord |
| 321 mov $context,%rsi # context |
| 322 mov \$154,%ecx # sizeof(CONTEXT) |
| 323 .long 0xa548f3fc # cld; rep movsq |
| 324 |
| 325 mov $disp,%rsi |
| 326 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER |
| 327 mov 8(%rsi),%rdx # arg2, disp->ImageBase |
| 328 mov 0(%rsi),%r8 # arg3, disp->ControlPc |
| 329 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry |
| 330 mov 40(%rsi),%r10 # disp->ContextRecord |
| 331 lea 56(%rsi),%r11 # &disp->HandlerData |
| 332 lea 24(%rsi),%r12 # &disp->EstablisherFrame |
| 333 mov %r10,32(%rsp) # arg5 |
| 334 mov %r11,40(%rsp) # arg6 |
| 335 mov %r12,48(%rsp) # arg7 |
| 336 mov %rcx,56(%rsp) # arg8, (NULL) |
| 337 call *__imp_RtlVirtualUnwind(%rip) |
| 338 |
| 339 mov \$1,%eax # ExceptionContinueSearch |
| 340 add \$64,%rsp |
| 341 popfq |
| 342 pop %r15 |
| 343 pop %r14 |
| 344 pop %r13 |
| 345 pop %r12 |
| 346 pop %rbp |
| 347 pop %rbx |
| 348 pop %rdi |
| 349 pop %rsi |
| 350 ret |
| 351 .size se_handler,.-se_handler |
| 352 |
| 353 .section .pdata |
| 354 .align 4 |
| 355 .rva .LSEH_begin_md5_block_asm_data_order |
| 356 .rva .LSEH_end_md5_block_asm_data_order |
| 357 .rva .LSEH_info_md5_block_asm_data_order |
| 358 |
| 359 .section .xdata |
| 360 .align 8 |
| 361 .LSEH_info_md5_block_asm_data_order: |
| 362 .byte 9,0,0,0 |
| 363 .rva se_handler |
| 364 ___ |
| 365 } |
| 366 |
247 print $code; | 367 print $code; |
248 | 368 |
249 close STDOUT; | 369 close STDOUT; |
OLD | NEW |