Index: simd/jdmrgss2-64.asm |
=================================================================== |
--- simd/jdmrgss2-64.asm (revision 144411) |
+++ simd/jdmrgss2-64.asm (working copy) |
@@ -1,7 +1,7 @@ |
; |
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2) |
; |
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB |
; Copyright 2009 D. R. Commander |
; |
; Based on |
@@ -12,7 +12,7 @@ |
; This file should be assembled with NASM (Netwide Assembler), |
; can *not* be assembled with Microsoft's MASM or any compatible |
; assembler (including Borland's Turbo Assembler). |
-; NASM is available from http://nasm.sourceforge.net/ for |
+; NASM is available from http://nasm.sourceforge.net/ or |
; http://sourceforge.net/project/showfiles.php?group_id=6208 |
; |
; [TAB8] |
@@ -252,17 +252,13 @@ |
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
jmp short .out0 |
.out1: ; --(unaligned)----------------- |
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's) |
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
.out0: |
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr: advance once past the XMMWORDs just stored; the movntdq path jumps here, .out1 falls through |
sub rcx, byte SIZEOF_XMMWORD |
jz near .endcolumn |
@@ -275,26 +271,23 @@ |
jmp near .columnloop |
.column_st32: |
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's) |
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE |
cmp rcx, byte 2*SIZEOF_XMMWORD |
jb short .column_st16 |
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr: step over the two XMMWORDs (xmmA, xmmD) stored above |
movdqa xmmA,xmmF |
sub rcx, byte 2*SIZEOF_XMMWORD |
jmp short .column_st15 |
.column_st16: |
cmp rcx, byte SIZEOF_XMMWORD |
jb short .column_st15 |
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
add rdi, byte SIZEOF_XMMWORD ; outptr |
movdqa xmmA,xmmD |
sub rcx, byte SIZEOF_XMMWORD |
.column_st15: |
-%ifdef STRICT_MEMORY_ACCESS |
; Store the lower 8 bytes of xmmA to the output when it has enough |
; space. |
cmp rcx, byte SIZEOF_MMWORD |
@@ -328,47 +321,6 @@ |
test rcx, rcx |
jz short .endcolumn |
mov BYTE [rdi], al |
-%else |
- mov rax,rcx |
- xor rcx, byte 0x0F |
- shl rcx, 2 |
- movd xmmB,ecx |
- psrlq xmmH,4 |
- pcmpeqb xmmE,xmmE |
- psrlq xmmH,xmmB |
- psrlq xmmE,xmmB |
- punpcklbw xmmE,xmmH |
- ; ---------------- |
- mov rcx,rdi |
- and rcx, byte SIZEOF_XMMWORD-1 |
- jz short .adj0 |
- add rax,rcx |
- cmp rax, byte SIZEOF_XMMWORD |
- ja short .adj0 |
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary |
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx |
- movdqa xmmG,xmmA |
- movdqa xmmC,xmmE |
- pslldq xmmA, SIZEOF_XMMWORD/2 |
- pslldq xmmE, SIZEOF_XMMWORD/2 |
- movd xmmD,ecx |
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT |
- jb short .adj1 |
- movd xmmF,ecx |
- psllq xmmA,xmmF |
- psllq xmmE,xmmF |
- jmp short .adj0 |
-.adj1: neg rcx |
- movd xmmF,ecx |
- psrlq xmmA,xmmF |
- psrlq xmmE,xmmF |
- psllq xmmG,xmmD |
- psllq xmmC,xmmD |
- por xmmA,xmmG |
- por xmmE,xmmC |
-.adj0: ; ---------------- |
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA |
-%endif ; STRICT_MEMORY_ACCESS ; --------------- |
%else ; RGB_PIXELSIZE == 4 ; ----------- |
@@ -413,19 +365,14 @@ |
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
jmp short .out0 |
.out1: ; --(unaligned)----------------- |
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's) |
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
.out0: |
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr: advance once past the XMMWORDs just stored; the movntdq path jumps here, .out1 falls through |
sub rcx, byte SIZEOF_XMMWORD |
jz near .endcolumn |
@@ -438,25 +385,22 @@ |
jmp near .columnloop |
.column_st32: |
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's) |
cmp rcx, byte SIZEOF_XMMWORD/2 |
jb short .column_st16 |
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD |
- add rdi, byte SIZEOF_XMMWORD ; outptr |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr: step over the two XMMWORDs (xmmA, xmmD) stored above |
movdqa xmmA,xmmC |
movdqa xmmD,xmmH |
sub rcx, byte SIZEOF_XMMWORD/2 |
.column_st16: |
cmp rcx, byte SIZEOF_XMMWORD/4 |
jb short .column_st15 |
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA |
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
add rdi, byte SIZEOF_XMMWORD ; outptr |
movdqa xmmA,xmmD |
sub rcx, byte SIZEOF_XMMWORD/4 |
.column_st15: |
-%ifdef STRICT_MEMORY_ACCESS |
; Store two pixels (8 bytes) of xmmA to the output when it has enough |
; space. |
cmp rcx, byte SIZEOF_XMMWORD/8 |
@@ -471,47 +415,6 @@ |
test rcx, rcx |
jz short .endcolumn |
movd DWORD [rdi], xmmA |
-%else |
- cmp rcx, byte SIZEOF_XMMWORD/16 |
- jb near .endcolumn |
- mov rax,rcx |
- xor rcx, byte 0x03 |
- inc rcx |
- shl rcx, 4 |
- movd xmmF,ecx |
- psrlq xmmE,xmmF |
- punpcklbw xmmE,xmmE |
- ; ---------------- |
- mov rcx,rdi |
- and rcx, byte SIZEOF_XMMWORD-1 |
- jz short .adj0 |
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE |
- cmp rax, byte SIZEOF_XMMWORD |
- ja short .adj0 |
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary |
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx |
- movdqa xmmB,xmmA |
- movdqa xmmG,xmmE |
- pslldq xmmA, SIZEOF_XMMWORD/2 |
- pslldq xmmE, SIZEOF_XMMWORD/2 |
- movd xmmC,ecx |
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT |
- jb short .adj1 |
- movd xmmH,ecx |
- psllq xmmA,xmmH |
- psllq xmmE,xmmH |
- jmp short .adj0 |
-.adj1: neg rcx |
- movd xmmH,ecx |
- psrlq xmmA,xmmH |
- psrlq xmmE,xmmH |
- psllq xmmB,xmmC |
- psllq xmmG,xmmC |
- por xmmA,xmmB |
- por xmmE,xmmG |
-.adj0: ; ---------------- |
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA |
-%endif ; STRICT_MEMORY_ACCESS ; --------------- |
%endif ; RGB_PIXELSIZE ; --------------- |