| Index: simd/jdmrgss2-64.asm
|
| ===================================================================
|
| --- simd/jdmrgss2-64.asm (revision 144411)
|
| +++ simd/jdmrgss2-64.asm (working copy)
|
| @@ -1,7 +1,7 @@
|
| ;
|
| ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
|
| ;
|
| -; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
| +; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
| ; Copyright 2009 D. R. Commander
|
| ;
|
| ; Based on
|
| @@ -12,7 +12,7 @@
|
| ; This file should be assembled with NASM (Netwide Assembler),
|
| ; can *not* be assembled with Microsoft's MASM or any compatible
|
| ; assembler (including Borland's Turbo Assembler).
|
| -; NASM is available from http://nasm.sourceforge.net/ for
|
| +; NASM is available from http://nasm.sourceforge.net/ or
|
| ; http://sourceforge.net/project/showfiles.php?group_id=6208
|
| ;
|
| ; [TAB8]
|
| @@ -252,17 +252,13 @@
|
| movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
| - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
| jmp short .out0
|
| .out1: ; --(unaligned)-----------------
|
| - pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
| - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
| .out0:
|
| + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
| sub rcx, byte SIZEOF_XMMWORD
|
| jz near .endcolumn
|
|
|
| @@ -275,26 +271,23 @@
|
| jmp near .columnloop
|
|
|
| .column_st32:
|
| - pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
|
| lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
| cmp rcx, byte 2*SIZEOF_XMMWORD
|
| jb short .column_st16
|
| - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| + add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
| movdqa xmmA,xmmF
|
| sub rcx, byte 2*SIZEOF_XMMWORD
|
| jmp short .column_st15
|
| .column_st16:
|
| cmp rcx, byte SIZEOF_XMMWORD
|
| jb short .column_st15
|
| - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| add rdi, byte SIZEOF_XMMWORD ; outptr
|
| movdqa xmmA,xmmD
|
| sub rcx, byte SIZEOF_XMMWORD
|
| .column_st15:
|
| -%ifdef STRICT_MEMORY_ACCESS
|
| ; Store the lower 8 bytes of xmmA to the output when it has enough
|
| ; space.
|
| cmp rcx, byte SIZEOF_MMWORD
|
| @@ -328,47 +321,6 @@
|
| test rcx, rcx
|
| jz short .endcolumn
|
| mov BYTE [rdi], al
|
| -%else
|
| - mov rax,rcx
|
| - xor rcx, byte 0x0F
|
| - shl rcx, 2
|
| - movd xmmB,ecx
|
| - psrlq xmmH,4
|
| - pcmpeqb xmmE,xmmE
|
| - psrlq xmmH,xmmB
|
| - psrlq xmmE,xmmB
|
| - punpcklbw xmmE,xmmH
|
| - ; ----------------
|
| - mov rcx,rdi
|
| - and rcx, byte SIZEOF_XMMWORD-1
|
| - jz short .adj0
|
| - add rax,rcx
|
| - cmp rax, byte SIZEOF_XMMWORD
|
| - ja short .adj0
|
| - and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
| - shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
| - movdqa xmmG,xmmA
|
| - movdqa xmmC,xmmE
|
| - pslldq xmmA, SIZEOF_XMMWORD/2
|
| - pslldq xmmE, SIZEOF_XMMWORD/2
|
| - movd xmmD,ecx
|
| - sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
| - jb short .adj1
|
| - movd xmmF,ecx
|
| - psllq xmmA,xmmF
|
| - psllq xmmE,xmmF
|
| - jmp short .adj0
|
| -.adj1: neg rcx
|
| - movd xmmF,ecx
|
| - psrlq xmmA,xmmF
|
| - psrlq xmmE,xmmF
|
| - psllq xmmG,xmmD
|
| - psllq xmmC,xmmD
|
| - por xmmA,xmmG
|
| - por xmmE,xmmC
|
| -.adj0: ; ----------------
|
| - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
| -%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
|
|
| %else ; RGB_PIXELSIZE == 4 ; -----------
|
|
|
| @@ -413,19 +365,14 @@
|
| movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
| movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
| - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
| jmp short .out0
|
| .out1: ; --(unaligned)-----------------
|
| - pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
| - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
| + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
| .out0:
|
| + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
| sub rcx, byte SIZEOF_XMMWORD
|
| jz near .endcolumn
|
|
|
| @@ -438,25 +385,22 @@
|
| jmp near .columnloop
|
|
|
| .column_st32:
|
| - pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
|
| cmp rcx, byte SIZEOF_XMMWORD/2
|
| jb short .column_st16
|
| - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
|
| - add rdi, byte SIZEOF_XMMWORD ; outptr
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
| + add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
| movdqa xmmA,xmmC
|
| movdqa xmmD,xmmH
|
| sub rcx, byte SIZEOF_XMMWORD/2
|
| .column_st16:
|
| cmp rcx, byte SIZEOF_XMMWORD/4
|
| jb short .column_st15
|
| - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
| + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
| add rdi, byte SIZEOF_XMMWORD ; outptr
|
| movdqa xmmA,xmmD
|
| sub rcx, byte SIZEOF_XMMWORD/4
|
| .column_st15:
|
| -%ifdef STRICT_MEMORY_ACCESS
|
| ; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
| ; space.
|
| cmp rcx, byte SIZEOF_XMMWORD/8
|
| @@ -471,47 +415,6 @@
|
| test rcx, rcx
|
| jz short .endcolumn
|
| movd DWORD [rdi], xmmA
|
| -%else
|
| - cmp rcx, byte SIZEOF_XMMWORD/16
|
| - jb near .endcolumn
|
| - mov rax,rcx
|
| - xor rcx, byte 0x03
|
| - inc rcx
|
| - shl rcx, 4
|
| - movd xmmF,ecx
|
| - psrlq xmmE,xmmF
|
| - punpcklbw xmmE,xmmE
|
| - ; ----------------
|
| - mov rcx,rdi
|
| - and rcx, byte SIZEOF_XMMWORD-1
|
| - jz short .adj0
|
| - lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
|
| - cmp rax, byte SIZEOF_XMMWORD
|
| - ja short .adj0
|
| - and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
|
| - shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
|
| - movdqa xmmB,xmmA
|
| - movdqa xmmG,xmmE
|
| - pslldq xmmA, SIZEOF_XMMWORD/2
|
| - pslldq xmmE, SIZEOF_XMMWORD/2
|
| - movd xmmC,ecx
|
| - sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
|
| - jb short .adj1
|
| - movd xmmH,ecx
|
| - psllq xmmA,xmmH
|
| - psllq xmmE,xmmH
|
| - jmp short .adj0
|
| -.adj1: neg rcx
|
| - movd xmmH,ecx
|
| - psrlq xmmA,xmmH
|
| - psrlq xmmE,xmmH
|
| - psllq xmmB,xmmC
|
| - psllq xmmG,xmmC
|
| - por xmmA,xmmB
|
| - por xmmE,xmmG
|
| -.adj0: ; ----------------
|
| - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
|
| -%endif ; STRICT_MEMORY_ACCESS ; ---------------
|
|
|
| %endif ; RGB_PIXELSIZE ; ---------------
|
|
|
|
|