;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
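;
; A rough scalar sketch of what this routine computes, in illustrative
; pseudo-C (the names below are descriptive only). For each coefficient
; rc, visited in zig-zag scan order:
;
;   x = abs(z[rc]) - (zbin[rc] + zbin_oq_value);
;   if (x - *zbin_boost_ptr++ >= 0) {
;       y = ((abs(z[rc]) + round[rc])
;            + ((abs(z[rc]) + round[rc]) * quant[rc] >> 16)) >> quant_shift[rc];
;       if (y) {
;           qcoeff[rc] = y;                      /* sign restored at the end */
;           zbin_boost_ptr = b->zrun_zbin_boost; /* reset the zero-run boost */
;       }
;   }
;
; Afterwards dqcoeff = qcoeff * dequant, and eob is set to one past the
; last nonzero coefficient in scan order.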

global sym(vp8_regular_quantize_b_sse2) PRIVATE
sym(vp8_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM 7
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif

    ALIGN_STACK 16, rax
    %define zrun_zbin_boost   0  ;  8
    %define abs_minus_zbin    8  ; 32
    %define temp_qcoeff       40 ; 32
    %define qcoeff            72 ; 32
    %define stack_size        104
    sub         rsp, stack_size
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rdx, [rdi + vp8_block_coeff]       ; coeff_ptr
    mov         rcx, [rdi + vp8_block_zbin]        ; zbin_ptr
    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value

    ; z
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp8_block_round]       ; round_ptr

    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4
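    ; Branchless abs: psraw by 15 broadcasts the sign bit, so sz is 0 for
    ; non-negative z and 0xffff (-1) for negative z, making (z ^ sz) - sz
    ; equal to abs(z). Worked lane example: z = -3 = 0xfffd, sz = 0xffff,
    ; z ^ sz = 0x0002, and 0x0002 - (-1) = 3.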

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp8_block_quant]       ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]
    movdqa      xmm6, [rdx + 16]

    movdqa      xmm3, [rcx]
    movdqa      xmm7, [rcx + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5
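    ; temp_qcoeff now holds (x + round) + ((x + round) * quant >> 16) in
    ; every lane; the zig-zag loop below finishes the quantization by
    ; shifting each selected lane right by quant_shift[rc].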

    pxor        xmm6, xmm6
    ; zero qcoeff
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp8_block_quant_shift]     ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rdx

%macro ZIGZAG_LOOP 1
    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

    ; downshift by quant_shift[rc]
    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ; qcoeff_ptr[rc] = temp_qcoeff[rc]
    mov         rdx, [rsp + zrun_zbin_boost]        ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
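; One ZIGZAG_LOOP expansion, as illustrative pseudo-C (rc = %1):
;
;   if (abs_minus_zbin[rc] - *zbin_boost_ptr++ >= 0) {
;       y = temp_qcoeff[rc] >> quant_shift[rc];
;       if (y != 0) {
;           qcoeff[rc] = y;
;           zbin_boost_ptr = b->zrun_zbin_boost; /* nonzero: restart boost */
;       }
;   }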
    ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
    ZIGZAG_LOOP  0
    ZIGZAG_LOOP  1
    ZIGZAG_LOOP  4
    ZIGZAG_LOOP  8
    ZIGZAG_LOOP  5
    ZIGZAG_LOOP  2
    ZIGZAG_LOOP  3
    ZIGZAG_LOOP  6
    ZIGZAG_LOOP  9
    ZIGZAG_LOOP 12
    ZIGZAG_LOOP 13
    ZIGZAG_LOOP 10
    ZIGZAG_LOOP  7
    ZIGZAG_LOOP 11
    ZIGZAG_LOOP 14
    ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    mov         rcx, [rsi + vp8_blockd_dequant]    ; dequant_ptr
    mov         rdi, [rsi + vp8_blockd_dqcoeff]    ; dqcoeff_ptr

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]     ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2                 ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rdi], xmm0                 ; store dqcoeff
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    pcmpeqw     xmm2, xmm6
    pcmpeqw     xmm3, xmm6
    ; !
    pcmpeqw     xmm6, xmm6
    pxor        xmm2, xmm6
    pxor        xmm3, xmm6
    ; mask inv_zig_zag
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff

    mov         BYTE PTR [rcx], al          ; store eob
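    ; How the reduction above works: pcmpeqw/pxor builds an all-ones mask
    ; in each nonzero lane, pand swaps that mask for the lane's 1-based
    ; zig-zag scan position from inv_zig_zag, and the pshufd/pshuflw plus
    ; pmaxsw pairs fold the two 8-word vectors into one and then
    ; 8 -> 4 -> 2 -> 1, leaving the maximum scan position (one past the
    ; last nonzero coefficient) in the low word.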

    ; begin epilog
    add         rsp, stack_size
    pop         rsp
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret

; void vp8_fast_quantize_b_sse2 | arg
;  (BLOCK  *b,                  |  0
;   BLOCKD *d)                  |  1
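;
; A rough scalar sketch of the fast path, in illustrative pseudo-C: there
; is no zbin or zero-run gating here, just round, multiply, and sign
; restoration for every coefficient rc:
;
;   x           = abs(z[rc]) + round[rc];
;   y           = (x * quant_fast[rc]) >> 16;
;   qcoeff[rc]  = sign(z[rc]) * y;
;   dqcoeff[rc] = qcoeff[rc] * dequant[rc];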

global sym(vp8_fast_quantize_b_sse2) PRIVATE
sym(vp8_fast_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %else
    ; these registers are used for passing arguments
  %endif
%endif

    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; dup z so we can save sz
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; x += round
    paddw       xmm1, [rcx]
    paddw       xmm5, [rcx + 16]

    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

    ; y = x * quant >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; x = (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; qcoeff = x
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; x * dequant
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5
    pmullw      xmm2, [rcx]
    pmullw      xmm3, [rcx + 16]

    ; dqcoeff = x * dequant
    movdqa      [rdi], xmm2
    movdqa      [rdi + 16], xmm3

    pxor        xmm4, xmm4                  ; clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ; set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    pand        xmm1, [GLOBAL(inv_zig_zag)]
    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw      xmm1, xmm5

    mov         rcx, [rsi + vp8_blockd_eob]

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        eax, xmm1
    and         eax, 0xff

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
inv_zig_zag:
    dw 0x0001, 0x0002, 0x0006, 0x0007
    dw 0x0003, 0x0005, 0x0008, 0x000d
    dw 0x0004, 0x0009, 0x000c, 0x000e
    dw 0x000a, 0x000b, 0x000f, 0x0010
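; inv_zig_zag[i] is the 1-based position of coefficient i in the VP8
; zig-zag scan (vp8_default_zig_zag1d in vp8/common/entropy.c). Masking
; nonzero lanes with this table and taking the horizontal max therefore
; yields eob directly, as a count of coefficients up to and including the
; last nonzero one.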