Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(70)

Side by Side Diff: openssl/crypto/camellia/asm/cmll-x86_64.pl

Issue 9254031: Upgrade chrome's OpenSSL to same version Android ships with. (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/openssl/
Patch Set: '' Created 8 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « openssl/crypto/camellia/asm/cmll-x86.pl ('k') | openssl/crypto/camellia/camellia.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5 #
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
13
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
16 #
17 # AMD64 Core2 EM64T
18 # -evp camellia-128-ecb 16.7 21.0 22.7
19 # + over gcc 3.4.6 +25% +5% 0%
20 #
21 # camellia-128-cbc 15.7 20.4 21.1
22 #
23 # 128-bit key setup 128 216 205 cycles/key
24 # + over gcc 3.4.6 +54% +39% +15%
25 #
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
31
32 $flavour = shift;
33 $output = shift;
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
# Locate the x86_64-xlate.pl translator: first in the directory this script
# was invoked from, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped into the translator,
# which is run with the same perl binary ($^X) and is handed the flavour and
# the output name.  Stop immediately if the pipe cannot be set up rather than
# carrying on as if the translator were running.
open STDOUT,"| $^X $xlate $flavour $output"
    or die "can't call $xlate: $!";
44
# hi(REG): rewrite a legacy register name of the form %eax/%rax .. %edx/%rdx
# to its high-byte alias (%ah .. %dh).  Any other string is returned as is.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }

# lo(REG): rewrite a register name to its low-byte alias:
#   %eax/%rax -> %al, %esi/%rsi -> %sil, %edi/%rdi -> %dil,
#   %r8/%r8d  -> %r8b (any numbered register).
# Any other string is returned as is.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
         $r =~ s/%[er]([sd]i)/%${1}l/;
         $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }
49
50 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
51 @S=("%r8d","%r9d","%r10d","%r11d");
52 $i0="%esi";
53 $i1="%edi";
54 $Tbl="%rbp"; # size optimization
55 $inp="%r12";
56 $out="%r13";
57 $key="%r14";
58 $keyend="%r15";
59 $arg0d=$win64?"%ecx":"%edi";
60
61 # const unsigned int Camellia_SBOX[4][256];
62 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
63 # and [2][] - with [3][]. This is done to minimize code size.
64 $SBOX1_1110=0; # Camellia_SBOX[0]
65 $SBOX4_4404=4; # Camellia_SBOX[1]
66 $SBOX2_0222=2048; # Camellia_SBOX[2]
67 $SBOX3_3033=2052; # Camellia_SBOX[3]
68
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the assembly text for one Feistel round to the global $code.
#   ROUND - round index; even rounds use @S in order (0,1,2,3), odd rounds
#           use it rotated by two (2,3,0,1).
#   SEED  - optional base displacement for the "prefetch next key" loads
#           (default 0).  A negative SEED makes the displacement step -8
#           per round instead of +8.
# The `...` fragments in the emitted text are left as-is here; they are
# evaluated by the final substitution pass over $code.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# Parenthesised list so that all four names are lexical; "my $a=..,$b=.."
# would only declare the first one and leave the rest as package globals.
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
	xor $s0,$t0 # t0^=key[0]
	xor $s1,$t1 # t1^=key[1]
	movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
	movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
	mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
	mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
	movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
	shr \$16,$t0
	movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
	xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
	shr \$16,$t1
	xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
	movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
	movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
	xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
	xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
	movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
	movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
	xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
	xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
	mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
	mov `$seed+($i+1)*$scale+4`($key),$t0
	xor $t3,$t2 # t2^=t3
	ror \$8,$t3 # t3=RightRotate(t3,8)
	xor $t2,$s2
	xor $t2,$s3
	xor $t3,$s3
___
}
106
107 # void Camellia_EncryptBlock_Rounds(
108 # int grandRounds,
109 # const Byte plaintext[],
110 # const KEY_TABLE_TYPE keyTable,
111 # Byte ciphertext[])
112 $code=<<___;
113 .text
114
115 # V1.x API
116 .globl Camellia_EncryptBlock
117 .type Camellia_EncryptBlock,\@abi-omnipotent
118 .align 16
119 Camellia_EncryptBlock:
120 movl \$128,%eax
121 subl $arg0d,%eax
122 movl \$3,$arg0d
123 adcl \$0,$arg0d # keyBitLength==128?3:4
124 jmp .Lenc_rounds
125 .size Camellia_EncryptBlock,.-Camellia_EncryptBlock
126 # V2
127 .globl Camellia_EncryptBlock_Rounds
128 .type Camellia_EncryptBlock_Rounds,\@function,4
129 .align 16
130 .Lenc_rounds:
131 Camellia_EncryptBlock_Rounds:
132 push %rbx
133 push %rbp
134 push %r13
135 push %r14
136 push %r15
137 .Lenc_prologue:
138
139 #mov %rsi,$inp # put away arguments
140 mov %rcx,$out
141 mov %rdx,$key
142
143 shl \$6,%edi # process grandRounds
144 lea .LCamellia_SBOX(%rip),$Tbl
145 lea ($key,%rdi),$keyend
146
147 mov 0(%rsi),@S[0] # load plaintext
148 mov 4(%rsi),@S[1]
149 mov 8(%rsi),@S[2]
150 bswap @S[0]
151 mov 12(%rsi),@S[3]
152 bswap @S[1]
153 bswap @S[2]
154 bswap @S[3]
155
156 call _x86_64_Camellia_encrypt
157
158 bswap @S[0]
159 bswap @S[1]
160 bswap @S[2]
161 mov @S[0],0($out)
162 bswap @S[3]
163 mov @S[1],4($out)
164 mov @S[2],8($out)
165 mov @S[3],12($out)
166
167 mov 0(%rsp),%r15
168 mov 8(%rsp),%r14
169 mov 16(%rsp),%r13
170 mov 24(%rsp),%rbp
171 mov 32(%rsp),%rbx
172 lea 40(%rsp),%rsp
173 .Lenc_epilogue:
174 ret
175 .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176
177 .type _x86_64_Camellia_encrypt,\@abi-omnipotent
178 .align 16
179 _x86_64_Camellia_encrypt:
180 xor 0($key),@S[1]
181 xor 4($key),@S[0] # ^=key[0-3]
182 xor 8($key),@S[3]
183 xor 12($key),@S[2]
184 .align 16
185 .Leloop:
186 mov 16($key),$t1 # prefetch key[4-5]
187 mov 20($key),$t0
188
189 ___
190 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
191 $code.=<<___;
192 lea 16*4($key),$key
193 cmp $keyend,$key
194 mov 8($key),$t3 # prefetch key[2-3]
195 mov 12($key),$t2
196 je .Ledone
197
198 and @S[0],$t0
199 or @S[3],$t3
200 rol \$1,$t0
201 xor $t3,@S[2] # s2^=s3|key[3];
202 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
203 and @S[2],$t2
204 or @S[1],$t1
205 rol \$1,$t2
206 xor $t1,@S[0] # s0^=s1|key[1];
207 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
208 jmp .Leloop
209
210 .align 16
211 .Ledone:
212 xor @S[2],$t0 # SwapHalf
213 xor @S[3],$t1
214 xor @S[0],$t2
215 xor @S[1],$t3
216
217 mov $t0,@S[0]
218 mov $t1,@S[1]
219 mov $t2,@S[2]
220 mov $t3,@S[3]
221
222 .byte 0xf3,0xc3 # rep ret
223 .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224
225 # V1.x API
226 .globl Camellia_DecryptBlock
227 .type Camellia_DecryptBlock,\@abi-omnipotent
228 .align 16
229 Camellia_DecryptBlock:
230 movl \$128,%eax
231 subl $arg0d,%eax
232 movl \$3,$arg0d
233 adcl \$0,$arg0d # keyBitLength==128?3:4
234 jmp .Ldec_rounds
235 .size Camellia_DecryptBlock,.-Camellia_DecryptBlock
236 # V2
237 .globl Camellia_DecryptBlock_Rounds
238 .type Camellia_DecryptBlock_Rounds,\@function,4
239 .align 16
240 .Ldec_rounds:
241 Camellia_DecryptBlock_Rounds:
242 push %rbx
243 push %rbp
244 push %r13
245 push %r14
246 push %r15
247 .Ldec_prologue:
248
249 #mov %rsi,$inp # put away arguments
250 mov %rcx,$out
251 mov %rdx,$keyend
252
253 shl \$6,%edi # process grandRounds
254 lea .LCamellia_SBOX(%rip),$Tbl
255 lea ($keyend,%rdi),$key
256
257 mov 0(%rsi),@S[0] # load plaintext
258 mov 4(%rsi),@S[1]
259 mov 8(%rsi),@S[2]
260 bswap @S[0]
261 mov 12(%rsi),@S[3]
262 bswap @S[1]
263 bswap @S[2]
264 bswap @S[3]
265
266 call _x86_64_Camellia_decrypt
267
268 bswap @S[0]
269 bswap @S[1]
270 bswap @S[2]
271 mov @S[0],0($out)
272 bswap @S[3]
273 mov @S[1],4($out)
274 mov @S[2],8($out)
275 mov @S[3],12($out)
276
277 mov 0(%rsp),%r15
278 mov 8(%rsp),%r14
279 mov 16(%rsp),%r13
280 mov 24(%rsp),%rbp
281 mov 32(%rsp),%rbx
282 lea 40(%rsp),%rsp
283 .Ldec_epilogue:
284 ret
285 .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286
287 .type _x86_64_Camellia_decrypt,\@abi-omnipotent
288 .align 16
289 _x86_64_Camellia_decrypt:
290 xor 0($key),@S[1]
291 xor 4($key),@S[0] # ^=key[0-3]
292 xor 8($key),@S[3]
293 xor 12($key),@S[2]
294 .align 16
295 .Ldloop:
296 mov -8($key),$t1 # prefetch key[4-5]
297 mov -4($key),$t0
298
299 ___
300 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
301 $code.=<<___;
302 lea -16*4($key),$key
303 cmp $keyend,$key
304 mov 0($key),$t3 # prefetch key[2-3]
305 mov 4($key),$t2
306 je .Lddone
307
308 and @S[0],$t0
309 or @S[3],$t3
310 rol \$1,$t0
311 xor $t3,@S[2] # s2^=s3|key[3];
312 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
313 and @S[2],$t2
314 or @S[1],$t1
315 rol \$1,$t2
316 xor $t1,@S[0] # s0^=s1|key[1];
317 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
318
319 jmp .Ldloop
320
321 .align 16
322 .Lddone:
323 xor @S[2],$t2
324 xor @S[3],$t3
325 xor @S[0],$t0
326 xor @S[1],$t1
327
328 mov $t2,@S[0] # SwapHalf
329 mov $t3,@S[1]
330 mov $t0,@S[2]
331 mov $t1,@S[3]
332
333 .byte 0xf3,0xc3 # rep ret
334 .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
335 ___
336
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends "mov REG,disp(KEY)" stores to the global $code, where the
# displacement expression is BIAS+RND*8+offset (left inside `...` for the
# final evaluation pass).
#   RND  - slot index, scaled by 8.
#   KEY  - base register of the destination.
#   BIAS - optional integer added to every displacement (default 0).  It is
#          recognised by being an integer literal, so an explicit 0 works
#          and register names are never numified.
#   REG  - one, two or four source registers:
#            four: stored pairwise swapped - REG[1] at +0, REG[0] at +4,
#                  REG[3] at +8, REG[2] at +12;
#            one/two: REG[0] at +0 and, if present, REG[1] at +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov $T[1],`$bias+$rnd*8+0`($key)
	mov $T[0],`$bias+$rnd*8+4`($key)
	mov $T[3],`$bias+$rnd*8+8`($key)
	mov $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
353
# _loadround(RND, KEY, [BIAS,] REG [, REG2])
#
# Appends "mov disp(KEY),REG" loads to the global $code: REG from
# BIAS+RND*8+0 and, when given, REG2 from BIAS+RND*8+8.  The displacement
# expressions stay inside `...` for the final evaluation pass.
#   BIAS - optional integer added to every displacement (default 0).  It is
#          recognised by being an integer literal, so an explicit 0 works
#          and register names are never numified.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
$code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
361
362 # shld is very slow on Intel EM64T family. Even on AMD it limits
363 # instruction decode rate [because it's VectorPath] and consequently
364 # performance...
365 sub __rotl128 {
366 my ($i0,$i1,$rot)=@_;
367
368 if ($rot) {
369 $code.=<<___;
370 mov $i0,%r11
371 shld \$$rot,$i1,$i0
372 shld \$$rot,%r11,$i1
373 ___
374 }
375 }
376
377 # ... Implementing 128-bit rotate without shld gives 80% better
378 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
379 # Core2. This is therefore preferred.
380 sub _rotl128 {
381 my ($i0,$i1,$rot)=@_;
382
383 if ($rot) {
384 $code.=<<___;
385 mov $i0,%r11
386 shl \$$rot,$i0
387 mov $i1,%r9
388 shr \$`64-$rot`,%r9
389 shr \$`64-$rot`,%r11
390 or %r9,$i0
391 shl \$$rot,$i1
392 or %r11,$i1
393 ___
394 }
395 }
396
397 { my $step=0;
398
399 $code.=<<___;
400 .globl Camellia_Ekeygen
401 .type Camellia_Ekeygen,\@function,3
402 .align 16
403 Camellia_Ekeygen:
404 push %rbx
405 push %rbp
406 push %r13
407 push %r14
408 push %r15
409 .Lkey_prologue:
410
411 mov %rdi,$keyend # put away arguments, keyBitLength
412 mov %rdx,$out # keyTable
413
414 mov 0(%rsi),@S[0] # load 0-127 bits
415 mov 4(%rsi),@S[1]
416 mov 8(%rsi),@S[2]
417 mov 12(%rsi),@S[3]
418
419 bswap @S[0]
420 bswap @S[1]
421 bswap @S[2]
422 bswap @S[3]
423 ___
424 &_saveround (0,$out,@S); # KL<<<0
425 $code.=<<___;
426 cmp \$128,$keyend # check keyBitLength
427 je .L1st128
428
429 mov 16(%rsi),@S[0] # load 128-191 bits
430 mov 20(%rsi),@S[1]
431 cmp \$192,$keyend
432 je .L1st192
433 mov 24(%rsi),@S[2] # load 192-255 bits
434 mov 28(%rsi),@S[3]
435 jmp .L1st256
436 .L1st192:
437 mov @S[0],@S[2]
438 mov @S[1],@S[3]
439 not @S[2]
440 not @S[3]
441 .L1st256:
442 bswap @S[0]
443 bswap @S[1]
444 bswap @S[2]
445 bswap @S[3]
446 ___
447 &_saveround (4,$out,@S); # temp storage for KR!
448 $code.=<<___;
449 xor 0($out),@S[1] # KR^KL
450 xor 4($out),@S[0]
451 xor 8($out),@S[3]
452 xor 12($out),@S[2]
453
454 .L1st128:
455 lea .LCamellia_SIGMA(%rip),$key
456 lea .LCamellia_SBOX(%rip),$Tbl
457
458 mov 0($key),$t1
459 mov 4($key),$t0
460 ___
461 &Camellia_Feistel($step++);
462 &Camellia_Feistel($step++);
463 $code.=<<___;
464 xor 0($out),@S[1] # ^KL
465 xor 4($out),@S[0]
466 xor 8($out),@S[3]
467 xor 12($out),@S[2]
468 ___
469 &Camellia_Feistel($step++);
470 &Camellia_Feistel($step++);
471 $code.=<<___;
472 cmp \$128,$keyend
473 jne .L2nd256
474
475 lea 128($out),$out # size optimization
476 shl \$32,%r8 # @S[0]||
477 shl \$32,%r10 # @S[2]||
478 or %r9,%r8 # ||@S[1]
479 or %r11,%r10 # ||@S[3]
480 ___
481 &_loadround (0,$out,-128,"%rax","%rbx"); # KL
482 &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
483 &_rotl128 ("%rax","%rbx",15);
484 &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
485 &_rotl128 ("%r8","%r10",15);
486 &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
487 &_rotl128 ("%r8","%r10",15); # 15+15=30
488 &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
489 &_rotl128 ("%rax","%rbx",30); # 15+30=45
490 &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
491 &_rotl128 ("%r8","%r10",15); # 30+15=45
492 &_saveround (12,$out,-128,"%r8"); # KA<<<45
493 &_rotl128 ("%rax","%rbx",15); # 45+15=60
494 &_saveround (13,$out,-128,"%rbx"); # KL<<<60
495 &_rotl128 ("%r8","%r10",15); # 45+15=60
496 &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
497 &_rotl128 ("%rax","%rbx",17); # 60+17=77
498 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
499 &_rotl128 ("%rax","%rbx",17); # 77+17=94
500 &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
501 &_rotl128 ("%r8","%r10",34); # 60+34=94
502 &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
503 &_rotl128 ("%rax","%rbx",17); # 94+17=111
504 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
505 &_rotl128 ("%r8","%r10",17); # 94+17=111
506 &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
507 $code.=<<___;
508 mov \$3,%eax
509 jmp .Ldone
510 .align 16
511 .L2nd256:
512 ___
513 &_saveround (6,$out,@S); # temp storage for KA!
514 $code.=<<___;
515 xor `4*8+0`($out),@S[1] # KA^KR
516 xor `4*8+4`($out),@S[0]
517 xor `5*8+0`($out),@S[3]
518 xor `5*8+4`($out),@S[2]
519 ___
520 &Camellia_Feistel($step++);
521 &Camellia_Feistel($step++);
522
523 &_loadround (0,$out,"%rax","%rbx"); # KL
524 &_loadround (4,$out,"%rcx","%rdx"); # KR
525 &_loadround (6,$out,"%r14","%r15"); # KA
526 $code.=<<___;
527 lea 128($out),$out # size optimization
528 shl \$32,%r8 # @S[0]||
529 shl \$32,%r10 # @S[2]||
530 or %r9,%r8 # ||@S[1]
531 or %r11,%r10 # ||@S[3]
532 ___
533 &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
534 &_rotl128 ("%rcx","%rdx",15);
535 &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
536 &_rotl128 ("%r14","%r15",15);
537 &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
538 &_rotl128 ("%rcx","%rdx",15); # 15+15=30
539 &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
540 &_rotl128 ("%r8","%r10",30);
541 &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
542 &_rotl128 ("%rax","%rbx",45);
543 &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
544 &_rotl128 ("%r14","%r15",30); # 15+30=45
545 &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
546 &_rotl128 ("%rax","%rbx",15); # 45+15=60
547 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
548 &_rotl128 ("%rcx","%rdx",30); # 30+30=60
549 &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
550 &_rotl128 ("%r8","%r10",30); # 30+30=60
551 &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
552 &_rotl128 ("%rax","%rbx",17); # 60+17=77
553 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
554 &_rotl128 ("%r14","%r15",32); # 45+32=77
555 &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
556 &_rotl128 ("%rcx","%rdx",34); # 60+34=94
557 &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
558 &_rotl128 ("%r14","%r15",17); # 77+17=94
559 &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
560 &_rotl128 ("%rax","%rbx",34); # 77+34=111
561 &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
562 &_rotl128 ("%r8","%r10",51); # 60+51=111
563 &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
564 $code.=<<___;
565 mov \$4,%eax
566 .Ldone:
567 mov 0(%rsp),%r15
568 mov 8(%rsp),%r14
569 mov 16(%rsp),%r13
570 mov 24(%rsp),%rbp
571 mov 32(%rsp),%rbx
572 lea 40(%rsp),%rsp
573 .Lkey_epilogue:
574 ret
575 .size Camellia_Ekeygen,.-Camellia_Ekeygen
576 ___
577 }
578
579 @SBOX=(
580 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
581 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
582 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
583 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
584 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
585 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
586 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
587 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
588 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
589 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
590 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
591 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
592 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
593 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
594 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
595 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
596
# Table-word generators.  Each takes an index 0..255 and returns one 32-bit
# table entry formatted as "0x%08x".  With v the byte looked up in @SBOX,
# the digits in the name show which of the four byte lanes (most
# significant first) carry it; a 0 marks a lane that is left zero:
#   S1110 - v = SBOX[i];                    v in lanes 3,2,1
#   S4404 - v = SBOX[i rotated left 1 bit]; v in lanes 3,2,0
#   S0222 - v = SBOX[i] rotated left 1 bit; v in lanes 2,1,0
#   S3033 - v = SBOX[i] rotated right 1 bit; v in lanes 3,1,0
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
601
602 $code.=<<___;
603 .align 64
604 .LCamellia_SIGMA:
605 .long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
606 .long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
607 .long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
608 .long 0, 0, 0, 0
609 .LCamellia_SBOX:
610 ___
611 # tables are interleaved, remember?
612 sub data_word { $code.=".long\t".join(',',@_)."\n"; }
613 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
614 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
615
616 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
617 # size_t length, const CAMELLIA_KEY *key,
618 # unsigned char *ivp,const int enc);
619 {
620 $_key="0(%rsp)";
621 $_end="8(%rsp)"; # inp+len&~15
622 $_res="16(%rsp)"; # len&15
623 $ivec="24(%rsp)";
624 $_ivp="40(%rsp)";
625 $_rsp="48(%rsp)";
626
627 $code.=<<___;
628 .globl Camellia_cbc_encrypt
629 .type Camellia_cbc_encrypt,\@function,6
630 .align 16
631 Camellia_cbc_encrypt:
632 cmp \$0,%rdx
633 je .Lcbc_abort
634 push %rbx
635 push %rbp
636 push %r12
637 push %r13
638 push %r14
639 push %r15
640 .Lcbc_prologue:
641
642 mov %rsp,%rbp
643 sub \$64,%rsp
644 and \$-64,%rsp
645
646 # place stack frame just "above mod 1024" the key schedule,
647 # this ensures that cache associativity suffices
648 lea -64-63(%rcx),%r10
649 sub %rsp,%r10
650 neg %r10
651 and \$0x3C0,%r10
652 sub %r10,%rsp
653 #add \$8,%rsp # 8 is reserved for callee's ra
654
655 mov %rdi,$inp # inp argument
656 mov %rsi,$out # out argument
657 mov %r8,%rbx # ivp argument
658 mov %rcx,$key # key argument
659 mov 272(%rcx),${keyend}d # grandRounds
660
661 mov %r8,$_ivp
662 mov %rbp,$_rsp
663
664 .Lcbc_body:
665 lea .LCamellia_SBOX(%rip),$Tbl
666
667 mov \$32,%ecx
668 .align 4
669 .Lcbc_prefetch_sbox:
670 mov 0($Tbl),%rax
671 mov 32($Tbl),%rsi
672 mov 64($Tbl),%rdi
673 mov 96($Tbl),%r11
674 lea 128($Tbl),$Tbl
675 loop .Lcbc_prefetch_sbox
676 sub \$4096,$Tbl
677 shl \$6,$keyend
678 mov %rdx,%rcx # len argument
679 lea ($key,$keyend),$keyend
680
681 cmp \$0,%r9d # enc argument
682 je .LCBC_DECRYPT
683
684 and \$-16,%rdx
685 and \$15,%rcx # length residue
686 lea ($inp,%rdx),%rdx
687 mov $key,$_key
688 mov %rdx,$_end
689 mov %rcx,$_res
690
691 cmp $inp,%rdx
692 mov 0(%rbx),@S[0] # load IV
693 mov 4(%rbx),@S[1]
694 mov 8(%rbx),@S[2]
695 mov 12(%rbx),@S[3]
696 je .Lcbc_enc_tail
697 jmp .Lcbc_eloop
698
699 .align 16
700 .Lcbc_eloop:
701 xor 0($inp),@S[0]
702 xor 4($inp),@S[1]
703 xor 8($inp),@S[2]
704 bswap @S[0]
705 xor 12($inp),@S[3]
706 bswap @S[1]
707 bswap @S[2]
708 bswap @S[3]
709
710 call _x86_64_Camellia_encrypt
711
712 mov $_key,$key # "rewind" the key
713 bswap @S[0]
714 mov $_end,%rdx
715 bswap @S[1]
716 mov $_res,%rcx
717 bswap @S[2]
718 mov @S[0],0($out)
719 bswap @S[3]
720 mov @S[1],4($out)
721 mov @S[2],8($out)
722 lea 16($inp),$inp
723 mov @S[3],12($out)
724 cmp %rdx,$inp
725 lea 16($out),$out
726 jne .Lcbc_eloop
727
728 cmp \$0,%rcx
729 jne .Lcbc_enc_tail
730
731 mov $_ivp,$out
732 mov @S[0],0($out) # write out IV residue
733 mov @S[1],4($out)
734 mov @S[2],8($out)
735 mov @S[3],12($out)
736 jmp .Lcbc_done
737
738 .align 16
739 .Lcbc_enc_tail:
740 xor %rax,%rax
741 mov %rax,0+$ivec
742 mov %rax,8+$ivec
743 mov %rax,$_res
744
745 .Lcbc_enc_pushf:
746 pushfq
747 cld
748 mov $inp,%rsi
749 lea 8+$ivec,%rdi
750 .long 0x9066A4F3 # rep movsb
751 popfq
752 .Lcbc_enc_popf:
753
754 lea $ivec,$inp
755 lea 16+$ivec,%rax
756 mov %rax,$_end
757 jmp .Lcbc_eloop # one more time
758
759 .align 16
760 .LCBC_DECRYPT:
761 xchg $key,$keyend
762 add \$15,%rdx
763 and \$15,%rcx # length residue
764 and \$-16,%rdx
765 mov $key,$_key
766 lea ($inp,%rdx),%rdx
767 mov %rdx,$_end
768 mov %rcx,$_res
769
770 mov (%rbx),%rax # load IV
771 mov 8(%rbx),%rbx
772 jmp .Lcbc_dloop
773 .align 16
774 .Lcbc_dloop:
775 mov 0($inp),@S[0]
776 mov 4($inp),@S[1]
777 mov 8($inp),@S[2]
778 bswap @S[0]
779 mov 12($inp),@S[3]
780 bswap @S[1]
781 mov %rax,0+$ivec # save IV to temporary storage
782 bswap @S[2]
783 mov %rbx,8+$ivec
784 bswap @S[3]
785
786 call _x86_64_Camellia_decrypt
787
788 mov $_key,$key # "rewind" the key
789 mov $_end,%rdx
790 mov $_res,%rcx
791
792 bswap @S[0]
793 mov ($inp),%rax # load IV for next iteration
794 bswap @S[1]
795 mov 8($inp),%rbx
796 bswap @S[2]
797 xor 0+$ivec,@S[0]
798 bswap @S[3]
799 xor 4+$ivec,@S[1]
800 xor 8+$ivec,@S[2]
801 lea 16($inp),$inp
802 xor 12+$ivec,@S[3]
803 cmp %rdx,$inp
804 je .Lcbc_ddone
805
806 mov @S[0],0($out)
807 mov @S[1],4($out)
808 mov @S[2],8($out)
809 mov @S[3],12($out)
810
811 lea 16($out),$out
812 jmp .Lcbc_dloop
813
814 .align 16
815 .Lcbc_ddone:
816 mov $_ivp,%rdx
817 cmp \$0,%rcx
818 jne .Lcbc_dec_tail
819
820 mov @S[0],0($out)
821 mov @S[1],4($out)
822 mov @S[2],8($out)
823 mov @S[3],12($out)
824
825 mov %rax,(%rdx) # write out IV residue
826 mov %rbx,8(%rdx)
827 jmp .Lcbc_done
828 .align 16
829 .Lcbc_dec_tail:
830 mov @S[0],0+$ivec
831 mov @S[1],4+$ivec
832 mov @S[2],8+$ivec
833 mov @S[3],12+$ivec
834
835 .Lcbc_dec_pushf:
836 pushfq
837 cld
838 lea 8+$ivec,%rsi
839 lea ($out),%rdi
840 .long 0x9066A4F3 # rep movsb
841 popfq
842 .Lcbc_dec_popf:
843
844 mov %rax,(%rdx) # write out IV residue
845 mov %rbx,8(%rdx)
846 jmp .Lcbc_done
847
848 .align 16
849 .Lcbc_done:
850 mov $_rsp,%rcx
851 mov 0(%rcx),%r15
852 mov 8(%rcx),%r14
853 mov 16(%rcx),%r13
854 mov 24(%rcx),%r12
855 mov 32(%rcx),%rbp
856 mov 40(%rcx),%rbx
857 lea 48(%rcx),%rsp
858 .Lcbc_abort:
859 ret
860 .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
861
862 .asciz "Camellia for x86_64 by <appro\@openssl.org>"
863 ___
864 }
865
866 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
867 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
868 if ($win64) {
869 $rec="%rcx";
870 $frame="%rdx";
871 $context="%r8";
872 $disp="%r9";
873
874 $code.=<<___;
875 .extern __imp_RtlVirtualUnwind
876 .type common_se_handler,\@abi-omnipotent
877 .align 16
878 common_se_handler:
879 push %rsi
880 push %rdi
881 push %rbx
882 push %rbp
883 push %r12
884 push %r13
885 push %r14
886 push %r15
887 pushfq
888 lea -64(%rsp),%rsp
889
890 mov 120($context),%rax # pull context->Rax
891 mov 248($context),%rbx # pull context->Rip
892
893 mov 8($disp),%rsi # disp->ImageBase
894 mov 56($disp),%r11 # disp->HandlerData
895
896 mov 0(%r11),%r10d # HandlerData[0]
897 lea (%rsi,%r10),%r10 # prologue label
898 cmp %r10,%rbx # context->Rip<prologue label
899 jb .Lin_prologue
900
901 mov 152($context),%rax # pull context->Rsp
902
903 mov 4(%r11),%r10d # HandlerData[1]
904 lea (%rsi,%r10),%r10 # epilogue label
905 cmp %r10,%rbx # context->Rip>=epilogue label
906 jae .Lin_prologue
907
908 lea 40(%rax),%rax
909 mov -8(%rax),%rbx
910 mov -16(%rax),%rbp
911 mov -24(%rax),%r13
912 mov -32(%rax),%r14
913 mov -40(%rax),%r15
914 mov %rbx,144($context) # restore context->Rbx
915 mov %rbp,160($context) # restore context->Rbp
916 mov %r13,224($context) # restore context->R13
917 mov %r14,232($context) # restore context->R14
918 mov %r15,240($context) # restore context->R15
919
920 .Lin_prologue:
921 mov 8(%rax),%rdi
922 mov 16(%rax),%rsi
923 mov %rax,152($context) # restore context->Rsp
924 mov %rsi,168($context) # restore context->Rsi
925 mov %rdi,176($context) # restore context->Rdi
926
927 jmp .Lcommon_seh_exit
928 .size common_se_handler,.-common_se_handler
929
930 .type cbc_se_handler,\@abi-omnipotent
931 .align 16
932 cbc_se_handler:
933 push %rsi
934 push %rdi
935 push %rbx
936 push %rbp
937 push %r12
938 push %r13
939 push %r14
940 push %r15
941 pushfq
942 lea -64(%rsp),%rsp
943
944 mov 120($context),%rax # pull context->Rax
945 mov 248($context),%rbx # pull context->Rip
946
947 lea .Lcbc_prologue(%rip),%r10
948 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
949 jb .Lin_cbc_prologue
950
951 lea .Lcbc_body(%rip),%r10
952 cmp %r10,%rbx # context->Rip<.Lcbc_body
953 jb .Lin_cbc_frame_setup
954
955 mov 152($context),%rax # pull context->Rsp
956
957 lea .Lcbc_abort(%rip),%r10
958 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
959 jae .Lin_cbc_prologue
960
961 # handle pushf/popf in Camellia_cbc_encrypt
962 lea .Lcbc_enc_pushf(%rip),%r10
963 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
964 jbe .Lin_cbc_no_flag
965 lea 8(%rax),%rax
966 lea .Lcbc_enc_popf(%rip),%r10
967 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
968 jb .Lin_cbc_no_flag
969 lea -8(%rax),%rax
970 lea .Lcbc_dec_pushf(%rip),%r10
971 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
972 jbe .Lin_cbc_no_flag
973 lea 8(%rax),%rax
974 lea .Lcbc_dec_popf(%rip),%r10
975 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
976 jb .Lin_cbc_no_flag
977 lea -8(%rax),%rax
978
979 .Lin_cbc_no_flag:
980 mov 48(%rax),%rax # $_rsp
981 lea 48(%rax),%rax
982
983 .Lin_cbc_frame_setup:
984 mov -8(%rax),%rbx
985 mov -16(%rax),%rbp
986 mov -24(%rax),%r12
987 mov -32(%rax),%r13
988 mov -40(%rax),%r14
989 mov -48(%rax),%r15
990 mov %rbx,144($context) # restore context->Rbx
991 mov %rbp,160($context) # restore context->Rbp
992 mov %r12,216($context) # restore context->R12
993 mov %r13,224($context) # restore context->R13
994 mov %r14,232($context) # restore context->R14
995 mov %r15,240($context) # restore context->R15
996
997 .Lin_cbc_prologue:
998 mov 8(%rax),%rdi
999 mov 16(%rax),%rsi
1000 mov %rax,152($context) # restore context->Rsp
1001 mov %rsi,168($context) # restore context->Rsi
1002 mov %rdi,176($context) # restore context->Rdi
1003
1004 .align 4
1005 .Lcommon_seh_exit:
1006
1007 mov 40($disp),%rdi # disp->ContextRecord
1008 mov $context,%rsi # context
1009 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1010 .long 0xa548f3fc # cld; rep movsq
1011
1012 mov $disp,%rsi
1013 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1014 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1015 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1016 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1017 mov 40(%rsi),%r10 # disp->ContextRecord
1018 lea 56(%rsi),%r11 # &disp->HandlerData
1019 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1020 mov %r10,32(%rsp) # arg5
1021 mov %r11,40(%rsp) # arg6
1022 mov %r12,48(%rsp) # arg7
1023 mov %rcx,56(%rsp) # arg8, (NULL)
1024 call *__imp_RtlVirtualUnwind(%rip)
1025
1026 mov \$1,%eax # ExceptionContinueSearch
1027 lea 64(%rsp),%rsp
1028 popfq
1029 pop %r15
1030 pop %r14
1031 pop %r13
1032 pop %r12
1033 pop %rbp
1034 pop %rbx
1035 pop %rdi
1036 pop %rsi
1037 ret
1038 .size cbc_se_handler,.-cbc_se_handler
1039
1040 .section .pdata
1041 .align 4
1042 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1043 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1044 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1045
1046 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1047 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1048 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1049
1050 .rva .LSEH_begin_Camellia_Ekeygen
1051 .rva .LSEH_end_Camellia_Ekeygen
1052 .rva .LSEH_info_Camellia_Ekeygen
1053
1054 .rva .LSEH_begin_Camellia_cbc_encrypt
1055 .rva .LSEH_end_Camellia_cbc_encrypt
1056 .rva .LSEH_info_Camellia_cbc_encrypt
1057
1058 .section .xdata
1059 .align 8
1060 .LSEH_info_Camellia_EncryptBlock_Rounds:
1061 .byte 9,0,0,0
1062 .rva common_se_handler
1063 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1064 .LSEH_info_Camellia_DecryptBlock_Rounds:
1065 .byte 9,0,0,0
1066 .rva common_se_handler
1067 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1068 .LSEH_info_Camellia_Ekeygen:
1069 .byte 9,0,0,0
1070 .rva common_se_handler
1071 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1072 .LSEH_info_Camellia_cbc_encrypt:
1073 .byte 9,0,0,0
1074 .rva cbc_se_handler
1075 ___
1076 }
1077
# Replace every `...` fragment embedded in the generated text with the
# result of evaluating it as Perl, then write the text to STDOUT (redirected
# into the x86_64-xlate.pl pipe earlier in this script).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a pipe waits for the child and returns false if it exited with a
# non-zero status, so check it: otherwise a failing translator would go
# unnoticed and leave a truncated or empty output behind.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
OLDNEW
« no previous file with comments | « openssl/crypto/camellia/asm/cmll-x86.pl ('k') | openssl/crypto/camellia/camellia.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698