Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/encoder/x86/quantize_sse2.asm

Issue 13042014 (Closed)
Base URL: https://src.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: libvpx: Pull from upstream (created 7 years, 8 months ago)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
8 ;
9
10
11 %include "vpx_ports/x86_abi_support.asm"
12 %include "vp8_asm_enc_offsets.asm"
13
14
15 ; void vp8_regular_quantize_b_sse2 | arg
16 ; (BLOCK *b, | 0
17 ; BLOCKD *d) | 1
18
19 global sym(vp8_regular_quantize_b_sse2) PRIVATE
20 sym(vp8_regular_quantize_b_sse2):
21 push rbp
22 mov rbp, rsp
23 SAVE_XMM 7
24 GET_GOT rbx
25
26 %if ABI_IS_32BIT
27 push rdi
28 push rsi
29 %else
30 %if LIBVPX_YASM_WIN64
31 push rdi
32 push rsi
33 %endif
34 %endif
35
36 ALIGN_STACK 16, rax
37 %define zrun_zbin_boost 0 ; 8
38 %define abs_minus_zbin 8 ; 32
39 %define temp_qcoeff 40 ; 32
40 %define qcoeff 72 ; 32
41 %define stack_size 104
42 sub rsp, stack_size
43 ; end prolog
44
45 %if ABI_IS_32BIT
46 mov rdi, arg(0) ; BLOCK *b
47 mov rsi, arg(1) ; BLOCKD *d
48 %else
49 %if LIBVPX_YASM_WIN64
50 mov rdi, rcx ; BLOCK *b
51 mov rsi, rdx ; BLOCKD *d
52 %else
53 ;mov rdi, rdi ; BLOCK *b
54 ;mov rsi, rsi ; BLOCKD *d
55 %endif
56 %endif
57
58 mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
59 mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
60 movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
61
62 ; z
63 movdqa xmm0, [rdx]
64 movdqa xmm4, [rdx + 16]
65 mov rdx, [rdi + vp8_block_round] ; round_ptr
66
67 pshuflw xmm7, xmm7, 0
68 punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
69
70 movdqa xmm1, xmm0
71 movdqa xmm5, xmm4
72
73 ; sz
74 psraw xmm0, 15
75 psraw xmm4, 15
76
77 ; (z ^ sz)
78 pxor xmm1, xmm0
79 pxor xmm5, xmm4
80
81 ; x = abs(z)
82 psubw xmm1, xmm0
83 psubw xmm5, xmm4
84
85 movdqa xmm2, [rcx]
86 movdqa xmm3, [rcx + 16]
87 mov rcx, [rdi + vp8_block_quant] ; quant_ptr
88
89 ; *zbin_ptr + zbin_oq_value
90 paddw xmm2, xmm7
91 paddw xmm3, xmm7
92
93 ; x - (*zbin_ptr + zbin_oq_value)
94 psubw xmm1, xmm2
95 psubw xmm5, xmm3
96 movdqa [rsp + abs_minus_zbin], xmm1
97 movdqa [rsp + abs_minus_zbin + 16], xmm5
98
99 ; add (zbin_ptr + zbin_oq_value) back
100 paddw xmm1, xmm2
101 paddw xmm5, xmm3
102
103 movdqa xmm2, [rdx]
104 movdqa xmm6, [rdx + 16]
105
106 movdqa xmm3, [rcx]
107 movdqa xmm7, [rcx + 16]
108
109 ; x + round
110 paddw xmm1, xmm2
111 paddw xmm5, xmm6
112
113 ; y = x * quant_ptr >> 16
114 pmulhw xmm3, xmm1
115 pmulhw xmm7, xmm5
116
117 ; y += x
118 paddw xmm1, xmm3
119 paddw xmm5, xmm7
120
121 movdqa [rsp + temp_qcoeff], xmm1
122 movdqa [rsp + temp_qcoeff + 16], xmm5
123
124 pxor xmm6, xmm6
125 ; zero qcoeff
126 movdqa [rsp + qcoeff], xmm6
127 movdqa [rsp + qcoeff + 16], xmm6
128
129 mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
130 mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
131 mov [rsp + zrun_zbin_boost], rdx
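Note: as a reading aid, the vector setup above can be summarized in scalar C. This is only a sketch of what ends up in the two stack scratch buffers (abs_minus_zbin and temp_qcoeff); the function name and parameter types are illustrative and do not exist in libvpx.

#include <stdint.h>

/* Sketch of the SSE2 setup above in scalar form.  abs_minus_zbin[] and
 * temp_qcoeff[] correspond to the %define'd stack slots; all other names
 * and types are illustrative. */
static void quantize_setup_sketch(const int16_t coeff[16],
                                  const int16_t zbin[16],
                                  const int16_t round[16],
                                  const int16_t quant[16],
                                  int16_t zbin_oq_value,
                                  int16_t abs_minus_zbin[16],
                                  int16_t temp_qcoeff[16])
{
    for (int i = 0; i < 16; i++) {
        const int z  = coeff[i];
        const int sz = z >> 15;              /* psraw: 0 for z >= 0, -1 for z < 0 */
        const int x  = (z ^ sz) - sz;        /* abs(z)                            */
        abs_minus_zbin[i] = (int16_t)(x - (zbin[i] + zbin_oq_value));
        const int xr = x + round[i];         /* x + round                         */
        temp_qcoeff[i] = (int16_t)(xr + ((xr * quant[i]) >> 16));  /* x + (x*q >> 16) */
    }
}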
132
133 %macro ZIGZAG_LOOP 1
134 ; x
135 movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
136
137 ; if (x >= zbin)
138 sub cx, WORD PTR[rdx] ; x - zbin
139 lea rdx, [rdx + 2] ; zbin_boost_ptr++
140 jl .rq_zigzag_loop_%1 ; x < zbin
141
142 movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
143
144 ; downshift by quant_shift[rc]
145 movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
146 sar edi, cl ; also sets Z bit
147 je .rq_zigzag_loop_%1 ; !y
148 mov WORD PTR[rsp + qcoeff + %1 * 2], di ; qcoeff_ptr[rc] = temp_qcoeff[rc]
149 mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
150 .rq_zigzag_loop_%1:
151 %endmacro
152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
153 ZIGZAG_LOOP 0
154 ZIGZAG_LOOP 1
155 ZIGZAG_LOOP 4
156 ZIGZAG_LOOP 8
157 ZIGZAG_LOOP 5
158 ZIGZAG_LOOP 2
159 ZIGZAG_LOOP 3
160 ZIGZAG_LOOP 6
161 ZIGZAG_LOOP 9
162 ZIGZAG_LOOP 12
163 ZIGZAG_LOOP 13
164 ZIGZAG_LOOP 10
165 ZIGZAG_LOOP 7
166 ZIGZAG_LOOP 11
167 ZIGZAG_LOOP 14
168 ZIGZAG_LOOP 15
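Note: the ZIGZAG_LOOP expansions above visit the coefficients in vp8_default_zig_zag1d order and apply the dead-zone test with a running zero-run boost. A scalar sketch of the same logic (function name and parameter types are illustrative, not the project's C fallback):

#include <stdint.h>

/* Scalar sketch of one pass over the 16 ZIGZAG_LOOP expansions above.
 * qcoeff[] is assumed to be zeroed beforehand, as the assembly does;
 * the assembly reads quant_shift as bytes. */
static void zigzag_loop_sketch(const int16_t abs_minus_zbin[16],
                               const int16_t temp_qcoeff[16],
                               const int16_t *zrun_zbin_boost,
                               const int8_t quant_shift[16],
                               int16_t qcoeff[16])
{
    static const int zig_zag[16] = { 0, 1, 4, 8, 5, 2, 3, 6,
                                     9, 12, 13, 10, 7, 11, 14, 15 };
    const int16_t *boost = zrun_zbin_boost;
    for (int i = 0; i < 16; i++) {
        const int rc = zig_zag[i];
        const int x  = abs_minus_zbin[rc];  /* abs(z) - (zbin + zbin_oq)           */
        const int zb = *boost++;            /* zero-run boost, advances every step */
        if (x >= zb) {                      /* "jl" skips when x - boost < 0       */
            const int y = temp_qcoeff[rc] >> quant_shift[rc];
            if (y != 0) {                   /* "je" skips when the shift leaves 0  */
                qcoeff[rc] = (int16_t)y;
                boost = zrun_zbin_boost;    /* nonzero coefficient: restart the run */
            }
        }
    }
}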
169
170 movdqa xmm2, [rsp + qcoeff]
171 movdqa xmm3, [rsp + qcoeff + 16]
172
173 mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
174 mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
175
176 ; y ^ sz
177 pxor xmm2, xmm0
178 pxor xmm3, xmm4
179 ; x = (y ^ sz) - sz
180 psubw xmm2, xmm0
181 psubw xmm3, xmm4
182
183 ; dequant
184 movdqa xmm0, [rcx]
185 movdqa xmm1, [rcx + 16]
186
187 mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
188
189 pmullw xmm0, xmm2
190 pmullw xmm1, xmm3
191
192 movdqa [rcx], xmm2 ; store qcoeff
193 movdqa [rcx + 16], xmm3
194 movdqa [rdi], xmm0 ; store dqcoeff
195 movdqa [rdi + 16], xmm1
196
197 mov rcx, [rsi + vp8_blockd_eob]
198
199 ; select the last value (in zig_zag order) for EOB
200 pcmpeqw xmm2, xmm6
201 pcmpeqw xmm3, xmm6
202 ; !
203 pcmpeqw xmm6, xmm6
204 pxor xmm2, xmm6
205 pxor xmm3, xmm6
206 ; mask inv_zig_zag
207 pand xmm2, [GLOBAL(inv_zig_zag)]
208 pand xmm3, [GLOBAL(inv_zig_zag + 16)]
209 ; select the max value
210 pmaxsw xmm2, xmm3
211 pshufd xmm3, xmm2, 00001110b
212 pmaxsw xmm2, xmm3
213 pshuflw xmm3, xmm2, 00001110b
214 pmaxsw xmm2, xmm3
215 pshuflw xmm3, xmm2, 00000001b
216 pmaxsw xmm2, xmm3
217 movd eax, xmm2
218 and eax, 0xff
219
220 mov BYTE PTR [rcx], al ; store eob
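Note: the EOB selection above can be read as follows: every nonzero coefficient contributes its 1-based position in the zig-zag scan (via the inv_zig_zag mask), and the pmaxsw/pshufd/pshuflw sequence reduces those positions to the largest one. A scalar sketch, with an illustrative function name:

#include <stdint.h>

/* Scalar sketch of the pcmpeqw/pand/pmaxsw sequence above. */
static uint8_t eob_sketch(const int16_t qcoeff[16], const int16_t inv_zig_zag[16])
{
    int16_t eob = 0;
    for (int i = 0; i < 16; i++)
        if (qcoeff[i] != 0 && inv_zig_zag[i] > eob)
            eob = inv_zig_zag[i];           /* keep the furthest nonzero position */
    return (uint8_t)eob;                    /* the assembly stores a single byte  */
}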
221
222 ; begin epilog
223 add rsp, stack_size
224 pop rsp
225 %if ABI_IS_32BIT
226 pop rsi
227 pop rdi
228 %else
229 %if LIBVPX_YASM_WIN64
230 pop rsi
231 pop rdi
232 %endif
233 %endif
234 RESTORE_GOT
235 RESTORE_XMM
236 pop rbp
237 ret
238
239 ; void vp8_fast_quantize_b_sse2 | arg
240 ; (BLOCK *b, | 0
241 ; BLOCKD *d) | 1
242
243 global sym(vp8_fast_quantize_b_sse2) PRIVATE
244 sym(vp8_fast_quantize_b_sse2):
245 push rbp
246 mov rbp, rsp
247 GET_GOT rbx
248
249 %if ABI_IS_32BIT
250 push rdi
251 push rsi
252 %else
253 %if LIBVPX_YASM_WIN64
254 push rdi
255 push rsi
256 %else
257 ; these registers are used for passing arguments
258 %endif
259 %endif
260
261 ; end prolog
262
263 %if ABI_IS_32BIT
264 mov rdi, arg(0) ; BLOCK *b
265 mov rsi, arg(1) ; BLOCKD *d
266 %else
267 %if LIBVPX_YASM_WIN64
268 mov rdi, rcx ; BLOCK *b
269 mov rsi, rdx ; BLOCKD *d
270 %else
271 ;mov rdi, rdi ; BLOCK *b
272 ;mov rsi, rsi ; BLOCKD *d
273 %endif
274 %endif
275
276 mov rax, [rdi + vp8_block_coeff]
277 mov rcx, [rdi + vp8_block_round]
278 mov rdx, [rdi + vp8_block_quant_fast]
279
280 ; z = coeff
281 movdqa xmm0, [rax]
282 movdqa xmm4, [rax + 16]
283
284 ; dup z so we can save sz
285 movdqa xmm1, xmm0
286 movdqa xmm5, xmm4
287
288 ; sz = z >> 15
289 psraw xmm0, 15
290 psraw xmm4, 15
291
292 ; x = abs(z) = (z ^ sz) - sz
293 pxor xmm1, xmm0
294 pxor xmm5, xmm4
295 psubw xmm1, xmm0
296 psubw xmm5, xmm4
297
298 ; x += round
299 paddw xmm1, [rcx]
300 paddw xmm5, [rcx + 16]
301
302 mov rax, [rsi + vp8_blockd_qcoeff]
303 mov rcx, [rsi + vp8_blockd_dequant]
304 mov rdi, [rsi + vp8_blockd_dqcoeff]
305
306 ; y = x * quant >> 16
307 pmulhw xmm1, [rdx]
308 pmulhw xmm5, [rdx + 16]
309
310 ; x = (y ^ sz) - sz
311 pxor xmm1, xmm0
312 pxor xmm5, xmm4
313 psubw xmm1, xmm0
314 psubw xmm5, xmm4
315
316 ; qcoeff = x
317 movdqa [rax], xmm1
318 movdqa [rax + 16], xmm5
319
320 ; x * dequant
321 movdqa xmm2, xmm1
322 movdqa xmm3, xmm5
323 pmullw xmm2, [rcx]
324 pmullw xmm3, [rcx + 16]
325
326 ; dqcoeff = x * dequant
327 movdqa [rdi], xmm2
328 movdqa [rdi + 16], xmm3
329
330 pxor xmm4, xmm4 ;clear all bits
331 pcmpeqw xmm1, xmm4
332 pcmpeqw xmm5, xmm4
333
334 pcmpeqw xmm4, xmm4 ;set all bits
335 pxor xmm1, xmm4
336 pxor xmm5, xmm4
337
338 pand xmm1, [GLOBAL(inv_zig_zag)]
339 pand xmm5, [GLOBAL(inv_zig_zag + 16)]
340
341 pmaxsw xmm1, xmm5
342
343 mov rcx, [rsi + vp8_blockd_eob]
344
345 ; now down to 8
346 pshufd xmm5, xmm1, 00001110b
347
348 pmaxsw xmm1, xmm5
349
350 ; only 4 left
351 pshuflw xmm5, xmm1, 00001110b
352
353 pmaxsw xmm1, xmm5
354
355 ; okay, just 2!
356 pshuflw xmm5, xmm1, 00000001b
357
358 pmaxsw xmm1, xmm5
359
360 movd eax, xmm1
361 and eax, 0xff
362
363 mov BYTE PTR [rcx], al ; store eob
364
365 ; begin epilog
366 %if ABI_IS_32BIT
367 pop rsi
368 pop rdi
369 %else
370 %if LIBVPX_YASM_WIN64
371 pop rsi
372 pop rdi
373 %endif
374 %endif
375
376 RESTORE_GOT
377 pop rbp
378 ret
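Note: end to end, the fast path computes roughly the following. This is a scalar sketch with illustrative names and types, not the project's C fallback; the 16-bit SIMD lane arithmetic is approximated with plain int math.

#include <stdint.h>

/* Scalar sketch of vp8_fast_quantize_b_sse2 above. */
static void fast_quantize_sketch(const int16_t coeff[16],
                                 const int16_t round[16],
                                 const int16_t quant_fast[16],
                                 const int16_t dequant[16],
                                 const int16_t inv_zig_zag[16],
                                 int16_t qcoeff[16],
                                 int16_t dqcoeff[16],
                                 uint8_t *eob)
{
    int16_t last = 0;
    for (int i = 0; i < 16; i++) {
        const int z  = coeff[i];
        const int sz = z >> 15;                     /* psraw: 0 or -1       */
        const int x  = ((z ^ sz) - sz) + round[i];  /* abs(z) + round       */
        const int y  = (x * quant_fast[i]) >> 16;   /* pmulhw               */
        const int q  = (y ^ sz) - sz;               /* restore the sign     */
        qcoeff[i]  = (int16_t)q;
        dqcoeff[i] = (int16_t)(q * dequant[i]);     /* pmullw keeps low 16  */
        if (q != 0 && inv_zig_zag[i] > last)
            last = inv_zig_zag[i];                  /* pand + pmaxsw reduce */
    }
    *eob = (uint8_t)last;
}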
379
380 SECTION_RODATA
381 align 16
382 inv_zig_zag:
383 dw 0x0001, 0x0002, 0x0006, 0x0007
384 dw 0x0003, 0x0005, 0x0008, 0x000d
385 dw 0x0004, 0x0009, 0x000c, 0x000e
386 dw 0x000a, 0x000b, 0x000f, 0x0010
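Note: inv_zig_zag holds, for each coefficient index, its 1-based position in the 4x4 zig-zag scan (the same order the ZIGZAG_LOOP invocations use), which is why masking it with the nonzero-coefficient mask and taking the maximum yields the end of block. A small illustrative self-check (names are local to this sketch):

#include <assert.h>

/* Illustrative check that the table above is the 1-based inverse of the
 * scan order used by the ZIGZAG_LOOP expansions. */
static void check_inv_zig_zag(void)
{
    static const int zig_zag[16]     = { 0, 1, 4, 8, 5, 2, 3, 6,
                                         9, 12, 13, 10, 7, 11, 14, 15 };
    static const int inv_zig_zag[16] = { 1, 2, 6, 7, 3, 5, 8, 13,
                                         4, 9, 12, 14, 10, 11, 15, 16 };
    for (int p = 0; p < 16; p++)
        assert(inv_zig_zag[zig_zag[p]] == p + 1);  /* scan position p holds coeff zig_zag[p] */
}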