| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license and patent | |
| 5 ; grant that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. All contributing project authors may be found in the AUTHORS | |
| 7 ; file in the root of the source tree. | |
| 8 ; | |
| 9 | |
| 10 | |
| 11 %include "vpx_ports/x86_abi_support.asm" | |
| 12 %include "vp8_asm_enc_offsets.asm" | |
| 13 | |
| 14 | |
| 15 ; void vp8_regular_quantize_b_sse2 | arg | |
| 16 ; (BLOCK *b, | 0 | |
| 17 ; BLOCKD *d) | 1 | |
; Quantizes the 16 signed 16-bit coefficients read through [b + vp8_block_coeff].
; Outputs, all written through pointers loaded from d:
;   [d + vp8_blockd_qcoeff]  - 16 quantized words
;   [d + vp8_blockd_dqcoeff] - low 16 bits of qcoeff * dequant, per word
;   [d + vp8_blockd_eob]     - one byte: 1-based zig-zag position of the last
;                              non-zero quantized word, or 0 when all are zero
; Per coefficient, visited in zig-zag order:
;   x = abs(z)
;   if (x - (zbin[rc] + zbin_extra) < *zbin_boost_ptr++) the output stays 0
;   else y = ((x + round[rc]) + (((x + round[rc]) * quant[rc]) >> 16))
;                >> quant_shift[rc]
;        a non-zero y is stored and zbin_boost_ptr is reset to its base value
; Finally the sign of z is re-applied to every stored y.
; Register roles: rdi = b until the first ZIGZAG_LOOP (edi is scratch after
; that), rsi = d, xmm0/xmm4 = per-word sign masks, xmm6 = zero after L124.
| 18 | |
| 19 global sym(vp8_regular_quantize_b_sse2) PRIVATE | |
| 20 sym(vp8_regular_quantize_b_sse2): | |
| 21 push rbp | | ; set up a frame pointer
| 22 mov rbp, rsp | |
| 23 SAVE_XMM 7 | | ; macro from the ABI support include; presumably preserves xmm6-xmm7 where the ABI requires it - TODO confirm
| 24 GET_GOT rbx | | ; macro; presumably prepares rbx for the GLOBAL() references below in PIC builds - TODO confirm
| 25 | |
| 26 %if ABI_IS_32BIT | |
| 27 push rdi | | ; rdi and rsi are overwritten with b and d below
| 28 push rsi | |
| 29 %else | |
| 30 %if LIBVPX_YASM_WIN64 | |
| 31 push rdi | | ; rdi and rsi are overwritten with b and d below
| 32 push rsi | |
| 33 %endif | |
| 34 %endif | |
| 35 | |
| 36 ALIGN_STACK 16, rax | | ; macro; presumably aligns rsp and saves the old rsp for the 'pop rsp' in the epilog - TODO confirm
| 37 %define zrun_zbin_boost 0 ; 8 bytes: saved base value of the zbin_boost pointer | |
| 38 %define abs_minus_zbin 8 ; 32 bytes: 16 words of x - (zbin + zbin_extra) | |
| 39 %define temp_qcoeff 40 ; 32 bytes: 16 words of y before the quant_shift downshift | |
| 40 %define qcoeff 72 ; 32 bytes: 16 words of quantized magnitudes, zeroed before the scan | |
| 41 %define stack_size 104 | | ; 8 + 32 + 32 + 32
| 42 sub rsp, stack_size | | ; NOTE(review): the movdqa stores below need rsp + 8 to be 16-byte aligned; depends on ALIGN_STACK - confirm
| 43 ; end prolog | |
| 44 | |
| 45 %if ABI_IS_32BIT | |
| 46 mov rdi, arg(0) ; BLOCK *b | |
| 47 mov rsi, arg(1) ; BLOCKD *d | |
| 48 %else | |
| 49 %if LIBVPX_YASM_WIN64 | |
| 50 mov rdi, rcx ; BLOCK *b | |
| 51 mov rsi, rdx ; BLOCKD *d | |
| 52 %else | |
| 53 ;mov rdi, rdi ; BLOCK *b | |
| 54 ;mov rsi, rsi ; BLOCKD *d | |
| 55 %endif | |
| 56 %endif | |
| 57 | |
| 58 mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr | |
| 59 mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr | |
| 60 movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value (32-bit load; only the low word is used) | |
| 61 | |
| 62 ; z: words 0-7 in xmm0, words 8-15 in xmm4 | |
| 63 movdqa xmm0, [rdx] | |
| 64 movdqa xmm4, [rdx + 16] | |
| 65 mov rdx, [rdi + vp8_block_round] ; round_ptr | |
| 66 | |
| 67 pshuflw xmm7, xmm7, 0 | | ; copy word 0 into the four low words
| 68 punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value (now in all eight words) | |
| 69 | |
| 70 movdqa xmm1, xmm0 | | ; keep copies of z; xmm0/xmm4 become the sign masks
| 71 movdqa xmm5, xmm4 | |
| 72 | |
| 73 ; sz: 0xffff for negative words, 0 otherwise | |
| 74 psraw xmm0, 15 | |
| 75 psraw xmm4, 15 | |
| 76 | |
| 77 ; (z ^ sz) | |
| 78 pxor xmm1, xmm0 | |
| 79 pxor xmm5, xmm4 | |
| 80 | |
| 81 ; x = abs(z) = (z ^ sz) - sz | |
| 82 psubw xmm1, xmm0 | |
| 83 psubw xmm5, xmm4 | |
| 84 | |
| 85 movdqa xmm2, [rcx] | | ; zbin words 0-7
| 86 movdqa xmm3, [rcx + 16] | | ; zbin words 8-15
| 87 mov rcx, [rdi + vp8_block_quant] ; quant_ptr | |
| 88 | |
| 89 ; *zbin_ptr + zbin_oq_value | |
| 90 paddw xmm2, xmm7 | |
| 91 paddw xmm3, xmm7 | |
| 92 | |
| 93 ; x - (*zbin_ptr + zbin_oq_value), saved for the scalar zig-zag scan | |
| 94 psubw xmm1, xmm2 | |
| 95 psubw xmm5, xmm3 | |
| 96 movdqa [rsp + abs_minus_zbin], xmm1 | |
| 97 movdqa [rsp + abs_minus_zbin + 16], xmm5 | |
| 98 | |
| 99 ; add (zbin_ptr + zbin_oq_value) back so xmm1/xmm5 hold x again | |
| 100 paddw xmm1, xmm2 | |
| 101 paddw xmm5, xmm3 | |
| 102 | |
| 103 movdqa xmm2, [rdx] | | ; round words 0-7
| 104 movdqa xmm6, [rdx + 16] | | ; round words 8-15
| 105 | |
| 106 movdqa xmm3, [rcx] | | ; quant words 0-7
| 107 movdqa xmm7, [rcx + 16] | | ; quant words 8-15 (zbin_oq_value in xmm7 is no longer needed)
| 108 | |
| 109 ; x + round | |
| 110 paddw xmm1, xmm2 | |
| 111 paddw xmm5, xmm6 | |
| 112 | |
| 113 ; y = x * quant_ptr >> 16 (signed high half of the 16x16 product) | |
| 114 pmulhw xmm3, xmm1 | |
| 115 pmulhw xmm7, xmm5 | |
| 116 | |
| 117 ; y += x (x here already includes round) | |
| 118 paddw xmm1, xmm3 | |
| 119 paddw xmm5, xmm7 | |
| 120 | |
| 121 movdqa [rsp + temp_qcoeff], xmm1 | |
| 122 movdqa [rsp + temp_qcoeff + 16], xmm5 | |
| 123 | |
| 124 pxor xmm6, xmm6 | | ; xmm6 = 0; reused as the zero operand of the eob compare below
| 125 ; zero qcoeff | |
| 126 movdqa [rsp + qcoeff], xmm6 | |
| 127 movdqa [rsp + qcoeff + 16], xmm6 | |
| 128 | |
| 129 mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | |
| 130 mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | |
| 131 mov [rsp + zrun_zbin_boost], rdx | | ; remember the base so the macro can reset rdx
| 132 | |
; ZIGZAG_LOOP rc: scalar step for the coefficient at raster index rc (%1).
; In:      rdx = current zbin_boost pointer, rax = quant_shift_ptr
; Out:     rdx advanced by one word, or reset to the saved base when a
;          non-zero value was stored into the qcoeff stack slot
; Clobbers: ecx, edi, flags
| 133 %macro ZIGZAG_LOOP 1 | |
| 134 ; x (actually x - (zbin + zbin_extra), from the stack copy) | |
| 135 movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] | |
| 136 | |
| 137 ; if (x >= zbin) | |
| 138 sub cx, WORD PTR[rdx] ; x - zbin: subtract the current zbin_boost entry; sets the flags tested by jl | |
| 139 lea rdx, [rdx + 2] ; zbin_boost_ptr++ (lea leaves the flags from sub untouched) | |
| 140 jl .rq_zigzag_loop_%1 ; x < zbin | |
| 141 | |
| 142 movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] | |
| 143 | |
| 144 ; downshift by quant_shift[rc] | |
| 145 movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] | |
| 146 sar edi, cl ; also sets Z bit. NOTE(review): x86 shifts leave flags unchanged when the count is 0, so this relies on quant_shift[rc] != 0 - confirm | |
| 147 je .rq_zigzag_loop_%1 ; !y | |
| 148 mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoef
f[rc] | |
| 149 mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | |
| 150 .rq_zigzag_loop_%1: | |
| 151 %endmacro | |
| 152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c | |
| 153 ZIGZAG_LOOP 0 | |
| 154 ZIGZAG_LOOP 1 | |
| 155 ZIGZAG_LOOP 4 | |
| 156 ZIGZAG_LOOP 8 | |
| 157 ZIGZAG_LOOP 5 | |
| 158 ZIGZAG_LOOP 2 | |
| 159 ZIGZAG_LOOP 3 | |
| 160 ZIGZAG_LOOP 6 | |
| 161 ZIGZAG_LOOP 9 | |
| 162 ZIGZAG_LOOP 12 | |
| 163 ZIGZAG_LOOP 13 | |
| 164 ZIGZAG_LOOP 10 | |
| 165 ZIGZAG_LOOP 7 | |
| 166 ZIGZAG_LOOP 11 | |
| 167 ZIGZAG_LOOP 14 | |
| 168 ZIGZAG_LOOP 15 | |
| 169 | |
| 170 movdqa xmm2, [rsp + qcoeff] | | ; quantized magnitudes, words 0-7
| 171 movdqa xmm3, [rsp + qcoeff + 16] | | ; quantized magnitudes, words 8-15
| 172 | |
| 173 mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr | |
| 174 mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr | |
| 175 | |
| 176 ; y ^ sz (xmm0/xmm4 still hold the sign masks computed from z) | |
| 177 pxor xmm2, xmm0 | |
| 178 pxor xmm3, xmm4 | |
| 179 ; x = (y ^ sz) - sz | |
| 180 psubw xmm2, xmm0 | |
| 181 psubw xmm3, xmm4 | |
| 182 | |
| 183 ; dequant | |
| 184 movdqa xmm0, [rcx] | |
| 185 movdqa xmm1, [rcx + 16] | |
| 186 | |
| 187 mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr | |
| 188 | |
| 189 pmullw xmm0, xmm2 | | ; low 16 bits of dequant * qcoeff
| 190 pmullw xmm1, xmm3 | |
| 191 | |
| 192 movdqa [rcx], xmm2 ; store qcoeff | |
| 193 movdqa [rcx + 16], xmm3 | |
| 194 movdqa [rdi], xmm0 ; store dqcoeff | |
| 195 movdqa [rdi + 16], xmm1 | |
| 196 | |
| 197 mov rcx, [rsi + vp8_blockd_eob] | | ; pointer to the eob byte written below
| 198 | |
| 199 ; select the last value (in zig_zag order) for EOB | |
| 200 pcmpeqw xmm2, xmm6 | | ; 0xffff where qcoeff == 0 (xmm6 is still zero)
| 201 pcmpeqw xmm3, xmm6 | |
| 202 ; ! (invert: 0xffff where qcoeff != 0) | |
| 203 pcmpeqw xmm6, xmm6 | | ; xmm6 = all ones
| 204 pxor xmm2, xmm6 | |
| 205 pxor xmm3, xmm6 | |
| 206 ; mask inv_zig_zag: keep the 1-based scan position of every non-zero word | |
| 207 pand xmm2, [GLOBAL(inv_zig_zag)] | |
| 208 pand xmm3, [GLOBAL(inv_zig_zag + 16)] | |
| 209 ; select the max value (horizontal max of 16 words, ending in word 0) | |
| 210 pmaxsw xmm2, xmm3 | | ; 16 -> 8 candidates
| 211 pshufd xmm3, xmm2, 00001110b | | ; bring the upper 64 bits down
| 212 pmaxsw xmm2, xmm3 | | ; 8 -> 4
| 213 pshuflw xmm3, xmm2, 00001110b | | ; bring words 2-3 down to words 0-1
| 214 pmaxsw xmm2, xmm3 | | ; 4 -> 2
| 215 pshuflw xmm3, xmm2, 00000001b | | ; bring word 1 down to word 0
| 216 pmaxsw xmm2, xmm3 | | ; 2 -> 1, result in word 0
| 217 movd eax, xmm2 | |
| 218 and eax, 0xff | | ; keep the low byte; table values are 0x01-0x10
| 219 | |
| 220 mov BYTE PTR [rcx], al ; store eob | |
| 221 | |
| 222 ; begin epilog | |
| 223 add rsp, stack_size | | ; release the local stack slots
| 224 pop rsp | | ; restore the pre-alignment rsp; presumably pushed by ALIGN_STACK - TODO confirm
| 225 %if ABI_IS_32BIT | |
| 226 pop rsi | |
| 227 pop rdi | |
| 228 %else | |
| 229 %if LIBVPX_YASM_WIN64 | |
| 230 pop rsi | |
| 231 pop rdi | |
| 232 %endif | |
| 233 %endif | |
| 234 RESTORE_GOT | |
| 235 RESTORE_XMM | |
| 236 pop rbp | |
| 237 ret | |
| 238 | |
| 239 ; void vp8_fast_quantize_b_sse2 | arg | |
| 240 ; (BLOCK *b, | 0 | |
| 241 ; BLOCKD *d) | 1 | |
; Fully vectorized quantizer for the 16 signed 16-bit coefficients read
; through [b + vp8_block_coeff]; no zero-bin test and no per-coefficient shift.
; Per word: x = abs(z); y = ((x + round) * quant_fast) >> 16; qcoeff = y with
; the sign of z re-applied; dqcoeff = low 16 bits of qcoeff * dequant.
; Outputs, all written through pointers loaded from d:
;   [d + vp8_blockd_qcoeff], [d + vp8_blockd_dqcoeff] - 16 words each
;   [d + vp8_blockd_eob] - one byte: 1-based zig-zag position of the last
;                          non-zero qcoeff word, or 0 when all are zero
; Uses xmm0-xmm5 only and no stack locals.
| 242 | |
| 243 global sym(vp8_fast_quantize_b_sse2) PRIVATE | |
| 244 sym(vp8_fast_quantize_b_sse2): | |
| 245 push rbp | | ; set up a frame pointer
| 246 mov rbp, rsp | |
| 247 GET_GOT rbx | | ; macro; presumably prepares rbx for the GLOBAL() references below in PIC builds - TODO confirm
| 248 | |
| 249 %if ABI_IS_32BIT | |
| 250 push rdi | | ; rdi and rsi are overwritten with b and d below
| 251 push rsi | |
| 252 %else | |
| 253 %if LIBVPX_YASM_WIN64 | |
| 254 push rdi | | ; rdi and rsi are overwritten with b and d below
| 255 push rsi | |
| 256 %else | |
| 257 ; these registers are used for passing arguments, so nothing is saved on this path | |
| 258 %endif | |
| 259 %endif | |
| 260 | |
| 261 ; end prolog | |
| 262 | |
| 263 %if ABI_IS_32BIT | |
| 264 mov rdi, arg(0) ; BLOCK *b | |
| 265 mov rsi, arg(1) ; BLOCKD *d | |
| 266 %else | |
| 267 %if LIBVPX_YASM_WIN64 | |
| 268 mov rdi, rcx ; BLOCK *b | |
| 269 mov rsi, rdx ; BLOCKD *d | |
| 270 %else | |
| 271 ;mov rdi, rdi ; BLOCK *b | |
| 272 ;mov rsi, rsi ; BLOCKD *d | |
| 273 %endif | |
| 274 %endif | |
| 275 | |
| 276 mov rax, [rdi + vp8_block_coeff] | | ; coefficient pointer
| 277 mov rcx, [rdi + vp8_block_round] | | ; round pointer
| 278 mov rdx, [rdi + vp8_block_quant_fast] | | ; quant_fast pointer
| 279 | |
| 280 ; z = coeff (words 0-7 in xmm0, words 8-15 in xmm4) | |
| 281 movdqa xmm0, [rax] | |
| 282 movdqa xmm4, [rax + 16] | |
| 283 | |
| 284 ; dup z so we can save sz | |
| 285 movdqa xmm1, xmm0 | |
| 286 movdqa xmm5, xmm4 | |
| 287 | |
| 288 ; sz = z >> 15 (0xffff for negative words, 0 otherwise) | |
| 289 psraw xmm0, 15 | |
| 290 psraw xmm4, 15 | |
| 291 | |
| 292 ; x = abs(z) = (z ^ sz) - sz | |
| 293 pxor xmm1, xmm0 | |
| 294 pxor xmm5, xmm4 | |
| 295 psubw xmm1, xmm0 | |
| 296 psubw xmm5, xmm4 | |
| 297 | |
| 298 ; x += round | |
| 299 paddw xmm1, [rcx] | |
| 300 paddw xmm5, [rcx + 16] | |
| 301 | |
| 302 mov rax, [rsi + vp8_blockd_qcoeff] | | ; output pointers; rax, rcx and rdi are reused from here on
| 303 mov rcx, [rsi + vp8_blockd_dequant] | |
| 304 mov rdi, [rsi + vp8_blockd_dqcoeff] | |
| 305 | |
| 306 ; y = x * quant >> 16 (signed high half of the 16x16 product) | |
| 307 pmulhw xmm1, [rdx] | |
| 308 pmulhw xmm5, [rdx + 16] | |
| 309 | |
| 310 ; x = (y ^ sz) - sz (re-apply the sign of z) | |
| 311 pxor xmm1, xmm0 | |
| 312 pxor xmm5, xmm4 | |
| 313 psubw xmm1, xmm0 | |
| 314 psubw xmm5, xmm4 | |
| 315 | |
| 316 ; qcoeff = x | |
| 317 movdqa [rax], xmm1 | |
| 318 movdqa [rax + 16], xmm5 | |
| 319 | |
| 320 ; x * dequant (on copies; xmm1/xmm5 are still needed for the eob search) | |
| 321 movdqa xmm2, xmm1 | |
| 322 movdqa xmm3, xmm5 | |
| 323 pmullw xmm2, [rcx] | |
| 324 pmullw xmm3, [rcx + 16] | |
| 325 | |
| 326 ; dqcoeff = x * dequant | |
| 327 movdqa [rdi], xmm2 | |
| 328 movdqa [rdi + 16], xmm3 | |
| 329 | |
| 330 pxor xmm4, xmm4 ;clear all bits | |
| 331 pcmpeqw xmm1, xmm4 | | ; 0xffff where qcoeff == 0
| 332 pcmpeqw xmm5, xmm4 | |
| 333 | |
| 334 pcmpeqw xmm4, xmm4 ;set all bits | |
| 335 pxor xmm1, xmm4 | | ; invert: 0xffff where qcoeff != 0
| 336 pxor xmm5, xmm4 | |
| 337 | |
| 338 pand xmm1, [GLOBAL(inv_zig_zag)] | | ; keep the 1-based scan position of every non-zero word
| 339 pand xmm5, [GLOBAL(inv_zig_zag + 16)] | |
| 340 | |
| 341 pmaxsw xmm1, xmm5 | | ; horizontal max begins: 16 -> 8 candidates
| 342 | |
| 343 mov rcx, [rsi + vp8_blockd_eob] | | ; pointer to the eob byte written below
| 344 | |
| 345 ; now down to 8 | |
| 346 pshufd xmm5, xmm1, 00001110b | | ; bring the upper 64 bits down
| 347 | |
| 348 pmaxsw xmm1, xmm5 | |
| 349 | |
| 350 ; only 4 left | |
| 351 pshuflw xmm5, xmm1, 00001110b | | ; bring words 2-3 down to words 0-1
| 352 | |
| 353 pmaxsw xmm1, xmm5 | |
| 354 | |
| 355 ; okay, just 2! | |
| 356 pshuflw xmm5, xmm1, 00000001b | | ; bring word 1 down to word 0
| 357 | |
| 358 pmaxsw xmm1, xmm5 | | ; maximum is now in word 0
| 359 | |
| 360 movd eax, xmm1 | |
| 361 and eax, 0xff | | ; keep the low byte; table values are 0x01-0x10
| 362 | |
| 363 mov BYTE PTR [rcx], al ; store eob | |
| 364 | |
| 365 ; begin epilog | |
| 366 %if ABI_IS_32BIT | |
| 367 pop rsi | |
| 368 pop rdi | |
| 369 %else | |
| 370 %if LIBVPX_YASM_WIN64 | |
| 371 pop rsi | |
| 372 pop rdi | |
| 373 %endif | |
| 374 %endif | |
| 375 | |
| 376 RESTORE_GOT | |
| 377 pop rbp | |
| 378 ret | |
| 379 | |
; inv_zig_zag: for each raster index 0-15, the 1-based position of that
; coefficient in the zig-zag scan (the order in which ZIGZAG_LOOP is invoked
; above: 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15).
; Both quantizers AND this table with a non-zero mask and take the maximum
; to obtain the end-of-block value.
| 380 SECTION_RODATA | |
| 381 align 16 | | ; read as two 16-byte memory operands by pand
| 382 inv_zig_zag: | |
| 383 dw 0x0001, 0x0002, 0x0006, 0x0007 | | ; raster indices 0-3
| 384 dw 0x0003, 0x0005, 0x0008, 0x000d | | ; raster indices 4-7
| 385 dw 0x0004, 0x0009, 0x000c, 0x000e | | ; raster indices 8-11
| 386 dw 0x000a, 0x000b, 0x000f, 0x0010 | | ; raster indices 12-15
| OLD | NEW |