Index: nss/lib/freebl/intel-aes-x64-masm.asm
diff --git a/nss/lib/freebl/intel-aes-x86-masm.asm b/nss/lib/freebl/intel-aes-x64-masm.asm
similarity index 62%
copy from nss/lib/freebl/intel-aes-x86-masm.asm
copy to nss/lib/freebl/intel-aes-x64-masm.asm
index 7d805e7660f15d20f89911424dc83dbb7d906dca..ef5c76ba28370882583003116b9aeeb3505e256d 100644
--- a/nss/lib/freebl/intel-aes-x86-masm.asm
+++ b/nss/lib/freebl/intel-aes-x64-masm.asm
@@ -10,9 +10,6 @@
; Please send feedback directly to crypto.feedback.alias@intel.com
-.MODEL FLAT, C
-.XMM
-
.DATA
ALIGN 16
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
@@ -23,74 +20,81 @@ Lcon2 dd 1bh,1bh,1bh,1bh
.CODE
-ctx textequ <ecx>
-output textequ <edx>
-input textequ <eax>
-inputLen textequ <edi>
+ctx textequ <rcx>
+output textequ <rdx>
+input textequ <r8>
+inputLen textequ <r9d>
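+
+; Win64 passes the first four integer arguments in rcx, rdx, r8 and r9.
+; Assuming the freebl block-cipher prototype
+; (cx, output, outputLen, maxOutputLen, input, inputLen), only the
+; context and output pointers arrive in registers; input and inputLen
+; are the fifth and sixth arguments, so the function bodies below must
+; reload them from the caller's stack.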
aes_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesenc xmm0, xmm7
- aesenc xmm1, xmm7
- aesenc xmm2, xmm7
- aesenc xmm3, xmm7
- aesenc xmm4, xmm7
- aesenc xmm5, xmm7
- aesenc xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenc xmm0, xmm8
+ aesenc xmm1, xmm8
+ aesenc xmm2, xmm8
+ aesenc xmm3, xmm8
+ aesenc xmm4, xmm8
+ aesenc xmm5, xmm8
+ aesenc xmm6, xmm8
+ aesenc xmm7, xmm8
ENDM
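+
+; The x86 version kept seven blocks in flight (xmm0-xmm6) with the round
+; key in xmm7; x64 doubles the SSE register file, so these macros work on
+; eight blocks (xmm0-xmm7) and hold the round key in xmm8 instead.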
aes_last_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesenclast xmm0, xmm7
- aesenclast xmm1, xmm7
- aesenclast xmm2, xmm7
- aesenclast xmm3, xmm7
- aesenclast xmm4, xmm7
- aesenclast xmm5, xmm7
- aesenclast xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenclast xmm0, xmm8
+ aesenclast xmm1, xmm8
+ aesenclast xmm2, xmm8
+ aesenclast xmm3, xmm8
+ aesenclast xmm4, xmm8
+ aesenclast xmm5, xmm8
+ aesenclast xmm6, xmm8
+ aesenclast xmm7, xmm8
ENDM
aes_dec_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesdec xmm0, xmm7
- aesdec xmm1, xmm7
- aesdec xmm2, xmm7
- aesdec xmm3, xmm7
- aesdec xmm4, xmm7
- aesdec xmm5, xmm7
- aesdec xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesdec xmm0, xmm8
+ aesdec xmm1, xmm8
+ aesdec xmm2, xmm8
+ aesdec xmm3, xmm8
+ aesdec xmm4, xmm8
+ aesdec xmm5, xmm8
+ aesdec xmm6, xmm8
+ aesdec xmm7, xmm8
ENDM
aes_dec_last_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesdeclast xmm0, xmm7
- aesdeclast xmm1, xmm7
- aesdeclast xmm2, xmm7
- aesdeclast xmm3, xmm7
- aesdeclast xmm4, xmm7
- aesdeclast xmm5, xmm7
- aesdeclast xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesdeclast xmm0, xmm8
+ aesdeclast xmm1, xmm8
+ aesdeclast xmm2, xmm8
+ aesdeclast xmm3, xmm8
+ aesdeclast xmm4, xmm8
+ aesdeclast xmm5, xmm8
+ aesdeclast xmm6, xmm8
+ aesdeclast xmm7, xmm8
ENDM
gen_aes_ecb_func MACRO enc, rnds
-LOCAL loop7
+LOCAL loop8
LOCAL loop1
LOCAL bail
- push inputLen
+ xor inputLen, inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]
+
+ sub rsp, 3*16
- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8
- lea ctx, [44+ctx]
+ lea ctx, [48+ctx]
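+
+; Stack-argument math, per the Win64 ABI: [rsp + 1*8 + 8*4] skips the
+; 8-byte return address plus the caller's 32-byte shadow space to reach
+; the fifth argument (input); inputLen sits in the next slot. xmm6-xmm8
+; are callee-saved under Win64, hence the spill area above. The
+; key-schedule offset grows from 44 to 48, presumably reflecting 8-byte
+; alignment of the pointer fields in the x64 AESContext layout.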
-loop7:
- cmp inputLen, 7*16
+loop8:
+ cmp inputLen, 8*16
jb loop1
movdqu xmm0, [0*16 + input]
@@ -100,15 +104,17 @@ loop7:
movdqu xmm4, [4*16 + input]
movdqu xmm5, [5*16 + input]
movdqu xmm6, [6*16 + input]
-
- movdqu xmm7, [0*16 + ctx]
- pxor xmm0, xmm7
- pxor xmm1, xmm7
- pxor xmm2, xmm7
- pxor xmm3, xmm7
- pxor xmm4, xmm7
- pxor xmm5, xmm7
- pxor xmm6, xmm7
+ movdqu xmm7, [7*16 + input]
+
+ movdqu xmm8, [0*16 + ctx]
+ pxor xmm0, xmm8
+ pxor xmm1, xmm8
+ pxor xmm2, xmm8
+ pxor xmm3, xmm8
+ pxor xmm4, xmm8
+ pxor xmm5, xmm8
+ pxor xmm6, xmm8
+ pxor xmm7, xmm8
IF enc eq 1
rnd textequ <aes_rnd>
@@ -136,11 +142,12 @@ ENDIF
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
+ movdqu [7*16 + output], xmm7
- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8
loop1:
cmp inputLen, 1*16
@@ -167,54 +174,46 @@ loop1:
jmp loop1
bail:
- xor eax, eax
- pop inputLen
- ret
+ xor rax, rax
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
+ ret
ENDM
-ALIGN 16
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP
-ALIGN 16
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP
-ALIGN 16
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP
-ALIGN 16
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP
-ALIGN 16
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP
-ALIGN 16
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP
-KEY textequ <ecx>
-KS textequ <edx>
-ITR textequ <eax>
+KEY textequ <rcx>
+KS textequ <rdx>
+ITR textequ <r8>
-ALIGN 16
intel_aes_encrypt_init_128 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
-
movdqu xmm1, [KEY]
movdqu [KS], xmm1
movdqa xmm2, xmm1
@@ -280,12 +279,8 @@ Lenc_128_ks_loop:
intel_aes_encrypt_init_128 ENDP
-ALIGN 16
intel_aes_decrypt_init_128 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
push KS
push KEY
@@ -320,16 +315,15 @@ intel_aes_decrypt_init_128 PROC
intel_aes_decrypt_init_128 ENDP
-ALIGN 16
intel_aes_encrypt_init_192 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
+ sub rsp, 16*2
+ movdqu [16*0 + rsp], xmm6
+ movdqu [16*1 + rsp], xmm7
- pxor xmm3, xmm3
movdqu xmm1, [KEY]
- pinsrd xmm3, DWORD PTR [16 + KEY], 0
- pinsrd xmm3, DWORD PTR [20 + KEY], 1
+ mov ITR, [16 + KEY]
+ movd xmm3, ITR
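+
+; A single 64-bit load replaces the two SSE4.1 pinsrd ops: key words 4
+; and 5 land in the low qword of xmm3 (MASM assembles movd with a 64-bit
+; register source as movq, which also zeroes the high qword), so the
+; explicit pxor that cleared xmm3 is no longer needed.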
movdqu [KS], xmm1
movdqa xmm5, xmm3
@@ -396,14 +390,14 @@ Lenc_192_ks_loop:
jnz Lenc_192_ks_loop
movdqu [16 + KS], xmm5
-ret
+
+ movdqu xmm7, [16*1 + rsp]
+ movdqu xmm6, [16*0 + rsp]
+ add rsp, 16*2
+ ret
intel_aes_encrypt_init_192 ENDP
-ALIGN 16
intel_aes_decrypt_init_192 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
push KS
push KEY
@@ -437,11 +431,12 @@ intel_aes_decrypt_init_192 PROC
ret
intel_aes_decrypt_init_192 ENDP
-ALIGN 16
+
intel_aes_encrypt_init_256 PROC
+ sub rsp, 16*2
+ movdqu [16*0 + rsp], xmm6
+ movdqu [16*1 + rsp], xmm7
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
movdqu xmm1, [16*0 + KEY]
movdqu xmm3, [16*1 + KEY]
@@ -502,14 +497,15 @@ Lenc_256_ks_loop:
pxor xmm1, xmm2
movdqu [16*2 + KS], xmm1
+ movdqu xmm7, [16*1 + rsp]
+ movdqu xmm6, [16*0 + rsp]
+ add rsp, 16*2
ret
+
intel_aes_encrypt_init_256 ENDP
-ALIGN 16
-intel_aes_decrypt_init_256 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
+intel_aes_decrypt_init_256 PROC
push KS
push KEY
@@ -550,14 +546,16 @@ gen_aes_cbc_enc_func MACRO rnds
LOCAL loop1
LOCAL bail
- push inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]
- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ sub rsp, 3*16
- lea ctx, [44+ctx]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8
+
+ lea ctx, [48+ctx]
movdqu xmm0, [-32+ctx]
@@ -566,6 +564,7 @@ LOCAL bail
movdqu xmm4, [2*16 + ctx]
movdqu xmm5, [3*16 + ctx]
movdqu xmm6, [4*16 + ctx]
+ movdqu xmm7, [5*16 + ctx]
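+
+; [-32+ctx] addresses what is presumably the IV field of the context
+; (ctx was advanced past it to the key schedule); CBC keeps its chaining
+; block there, reloading it on entry and writing the last ciphertext
+; block back on exit. With xmm7 now available, one more round key stays
+; pinned in a register across the whole buffer than on x86.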
loop1:
cmp inputLen, 1*16
@@ -579,15 +578,16 @@ loop1:
aesenc xmm0, xmm4
aesenc xmm0, xmm5
aesenc xmm0, xmm6
+ aesenc xmm0, xmm7
- i = 5
+ i = 6
WHILE i LT rnds
- movdqu xmm7, [i*16 + ctx]
- aesenc xmm0, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenc xmm0, xmm8
i = i+1
ENDM
- movdqu xmm7, [rnds*16 + ctx]
- aesenclast xmm0, xmm7
+ movdqu xmm8, [rnds*16 + ctx]
+ aesenclast xmm0, xmm8
movdqu [output], xmm0
@@ -599,30 +599,36 @@ loop1:
bail:
movdqu [-32+ctx], xmm0
- xor eax, eax
- pop inputLen
+ xor rax, rax
+
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
ret
ENDM
gen_aes_cbc_dec_func MACRO rnds
-LOCAL loop7
+LOCAL loop8
LOCAL loop1
LOCAL dec1
LOCAL bail
- push inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]
- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ sub rsp, 3*16
- lea ctx, [44+ctx]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8
-loop7:
- cmp inputLen, 7*16
+ lea ctx, [48+ctx]
+
+loop8:
+ cmp inputLen, 8*16
jb dec1
movdqu xmm0, [0*16 + input]
@@ -632,15 +638,17 @@ loop7:
movdqu xmm4, [4*16 + input]
movdqu xmm5, [5*16 + input]
movdqu xmm6, [6*16 + input]
-
- movdqu xmm7, [0*16 + ctx]
- pxor xmm0, xmm7
- pxor xmm1, xmm7
- pxor xmm2, xmm7
- pxor xmm3, xmm7
- pxor xmm4, xmm7
- pxor xmm5, xmm7
- pxor xmm6, xmm7
+ movdqu xmm7, [7*16 + input]
+
+ movdqu xmm8, [0*16 + ctx]
+ pxor xmm0, xmm8
+ pxor xmm1, xmm8
+ pxor xmm2, xmm8
+ pxor xmm3, xmm8
+ pxor xmm4, xmm8
+ pxor xmm5, xmm8
+ pxor xmm6, xmm8
+ pxor xmm7, xmm8
i = 1
WHILE i LT rnds
@@ -649,21 +657,23 @@ loop7:
ENDM
aes_dec_last_rnd rnds
- movdqu xmm7, [-32 + ctx]
- pxor xmm0, xmm7
- movdqu xmm7, [0*16 + input]
- pxor xmm1, xmm7
- movdqu xmm7, [1*16 + input]
- pxor xmm2, xmm7
- movdqu xmm7, [2*16 + input]
- pxor xmm3, xmm7
- movdqu xmm7, [3*16 + input]
- pxor xmm4, xmm7
- movdqu xmm7, [4*16 + input]
- pxor xmm5, xmm7
- movdqu xmm7, [5*16 + input]
- pxor xmm6, xmm7
- movdqu xmm7, [6*16 + input]
+ movdqu xmm8, [-32 + ctx]
+ pxor xmm0, xmm8
+ movdqu xmm8, [0*16 + input]
+ pxor xmm1, xmm8
+ movdqu xmm8, [1*16 + input]
+ pxor xmm2, xmm8
+ movdqu xmm8, [2*16 + input]
+ pxor xmm3, xmm8
+ movdqu xmm8, [3*16 + input]
+ pxor xmm4, xmm8
+ movdqu xmm8, [4*16 + input]
+ pxor xmm5, xmm8
+ movdqu xmm8, [5*16 + input]
+ pxor xmm6, xmm8
+ movdqu xmm8, [6*16 + input]
+ pxor xmm7, xmm8
+ movdqu xmm8, [7*16 + input]
movdqu [0*16 + output], xmm0
movdqu [1*16 + output], xmm1
@@ -672,12 +682,13 @@ loop7:
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
- movdqu [-32 + ctx], xmm7
+ movdqu [7*16 + output], xmm7
+ movdqu [-32 + ctx], xmm8
- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8
dec1:
movdqu xmm3, [-32 + ctx]
@@ -711,143 +722,152 @@ loop1:
bail:
movdqu [-32 + ctx], xmm3
- xor eax, eax
- pop inputLen
+ xor rax, rax
+
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
ret
ENDM
-ALIGN 16
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP
-ALIGN 16
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP
-ALIGN 16
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP
-ALIGN 16
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP
-ALIGN 16
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP
-ALIGN 16
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP
-ctrCtx textequ <esi>
-CTR textequ <ebx>
+ctrCtx textequ <r10>
+CTR textequ <r11d>
+CTRSave textequ <eax>
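+
+; r10 and r11 are volatile under Win64, so unlike the x86 version (which
+; had to push/pop esi and ebx) no saves are needed for ctrCtx and CTR;
+; CTRSave keeps the counter in host byte order between increments.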
gen_aes_ctr_func MACRO rnds
-LOCAL loop7
+LOCAL loop8
LOCAL loop1
LOCAL enc1
LOCAL bail
- push inputLen
- push ctrCtx
- push CTR
- push ebp
+ mov input, [rsp + 8*1 + 4*8]
+ mov inputLen, [rsp + 8*1 + 5*8]
+
+ mov ctrCtx, ctx
+ mov ctx, [8+ctrCtx]
+ lea ctx, [48+ctx]
- mov ctrCtx, [esp + 4*5 + 0*4]
- mov output, [esp + 4*5 + 1*4]
- mov input, [esp + 4*5 + 4*4]
- mov inputLen, [esp + 4*5 + 5*4]
+ sub rsp, 3*16
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8
- mov ctx, [4+ctrCtx]
- lea ctx, [44+ctx]
- mov ebp, esp
- sub esp, 7*16
- and esp, -16
+ push rbp
+ mov rbp, rsp
+ sub rsp, 8*16
+ and rsp, -16
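+
+; The eight pre-built counter blocks below are stored with movdqa, so
+; rsp is aligned down to a 16-byte boundary after reserving room for
+; them; rbp keeps the unaligned frame so the adjustment can be undone
+; exactly on exit.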
- movdqu xmm0, [8+ctrCtx]
- mov ctrCtx, [ctrCtx + 8 + 3*4]
- bswap ctrCtx
+
+ movdqu xmm0, [16+ctrCtx]
+ mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
+ bswap CTRSave
movdqu xmm1, [ctx + 0*16]
pxor xmm0, xmm1
- movdqa [esp + 0*16], xmm0
- movdqa [esp + 1*16], xmm0
- movdqa [esp + 2*16], xmm0
- movdqa [esp + 3*16], xmm0
- movdqa [esp + 4*16], xmm0
- movdqa [esp + 5*16], xmm0
- movdqa [esp + 6*16], xmm0
+ movdqa [rsp + 0*16], xmm0
+ movdqa [rsp + 1*16], xmm0
+ movdqa [rsp + 2*16], xmm0
+ movdqa [rsp + 3*16], xmm0
+ movdqa [rsp + 4*16], xmm0
+ movdqa [rsp + 5*16], xmm0
+ movdqa [rsp + 6*16], xmm0
+ movdqa [rsp + 7*16], xmm0
+
+ inc CTRSave
+ mov CTR, CTRSave
+ bswap CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 1*16 + 3*4], CTR
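+
+; Pattern for each pre-built block: the counter lives in CTRSave in host
+; order, is bswap-ed back to big-endian wire format, and is XORed with
+; dword 3 of round key 0 before being patched into the block; the blocks
+; were stored already whitened with that key, so updating the counter
+; dword must apply the same XOR.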
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 1*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 2*16 + 3*4], CTR
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 2*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 3*16 + 3*4], CTR
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 3*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 4*16 + 3*4], CTR
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 4*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 5*16 + 3*4], CTR
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 5*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 6*16 + 3*4], CTR
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 6*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 7*16 + 3*4], CTR
-loop7:
- cmp inputLen, 7*16
+loop8:
+ cmp inputLen, 8*16
jb loop1
- movdqu xmm0, [0*16 + esp]
- movdqu xmm1, [1*16 + esp]
- movdqu xmm2, [2*16 + esp]
- movdqu xmm3, [3*16 + esp]
- movdqu xmm4, [4*16 + esp]
- movdqu xmm5, [5*16 + esp]
- movdqu xmm6, [6*16 + esp]
+ movdqu xmm0, [0*16 + rsp]
+ movdqu xmm1, [1*16 + rsp]
+ movdqu xmm2, [2*16 + rsp]
+ movdqu xmm3, [3*16 + rsp]
+ movdqu xmm4, [4*16 + rsp]
+ movdqu xmm5, [5*16 + rsp]
+ movdqu xmm6, [6*16 + rsp]
+ movdqu xmm7, [7*16 + rsp]
i = 1
- WHILE i LE 7
+ WHILE i LE 8
aes_rnd i
- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + (i-1)*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
i = i+1
ENDM
@@ -857,20 +877,22 @@ loop7:
ENDM
aes_last_rnd rnds
- movdqu xmm7, [0*16 + input]
- pxor xmm0, xmm7
- movdqu xmm7, [1*16 + input]
- pxor xmm1, xmm7
- movdqu xmm7, [2*16 + input]
- pxor xmm2, xmm7
- movdqu xmm7, [3*16 + input]
- pxor xmm3, xmm7
- movdqu xmm7, [4*16 + input]
- pxor xmm4, xmm7
- movdqu xmm7, [5*16 + input]
- pxor xmm5, xmm7
- movdqu xmm7, [6*16 + input]
- pxor xmm6, xmm7
+ movdqu xmm8, [0*16 + input]
+ pxor xmm0, xmm8
+ movdqu xmm8, [1*16 + input]
+ pxor xmm1, xmm8
+ movdqu xmm8, [2*16 + input]
+ pxor xmm2, xmm8
+ movdqu xmm8, [3*16 + input]
+ pxor xmm3, xmm8
+ movdqu xmm8, [4*16 + input]
+ pxor xmm4, xmm8
+ movdqu xmm8, [5*16 + input]
+ pxor xmm5, xmm8
+ movdqu xmm8, [6*16 + input]
+ pxor xmm6, xmm8
+ movdqu xmm8, [7*16 + input]
+ pxor xmm7, xmm8
movdqu [0*16 + output], xmm0
movdqu [1*16 + output], xmm1
@@ -879,19 +901,20 @@ loop7:
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
+ movdqu [7*16 + output], xmm7
- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8
loop1:
cmp inputLen, 1*16
jb bail
- movdqu xmm0, [esp]
- add esp, 16
+ movdqu xmm0, [rsp]
+ add rsp, 16
i = 1
WHILE i LT rnds
@@ -913,34 +936,33 @@ loop1:
bail:
- mov ctrCtx, [ebp + 4*5 + 0*4]
- movdqu xmm0, [esp]
+ movdqu xmm0, [rsp]
movdqu xmm1, [ctx + 0*16]
pxor xmm0, xmm1
- movdqu [8+ctrCtx], xmm0
+ movdqu [16+ctrCtx], xmm0
+
+
+ xor rax, rax
+ mov rsp, rbp
+ pop rbp
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
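+
+; Teardown mirrors the prologue: rsp is first restored from rbp (undoing
+; the 16-byte alignment), rbp is popped, and only then does rsp point at
+; the xmm6-xmm8 spill area that was reserved before the frame was built.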
- xor eax, eax
- mov esp, ebp
- pop ebp
- pop CTR
- pop ctrCtx
- pop inputLen
ret
ENDM
-ALIGN 16
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP
-ALIGN 16
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP
-ALIGN 16
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP