Chromium Code Reviews

Side by Side Diff: chromium/patches/to_upstream/37_VP8_armv6_optimizations.patch

Issue 9290059: Initial commit of all previous Chrome build scripts. (Closed) Base URL: http://git.chromium.org/chromium/third_party/ffmpeg.git@master
Patch Set: Drop deprecated subfolder. Created 8 years, 10 months ago
1 From f2a35674d0ab9fc1852c088482fd51bf12e5ed45 Mon Sep 17 00:00:00 2001
2 From: "Ronald S. Bultje" <rsbultje@gmail.com>
3 Date: Wed, 24 Aug 2011 13:58:37 -0700
4 Subject: [PATCH] VP8: armv6 optimizations.
5
6 From 52.503s (~40fps) to 27.973s (~80fps) decoding of the 480p Sintel
7 trailer, i.e. a ~2x overall speedup, on a Nexus S.
8 ---
9 libavcodec/arm/Makefile | 3 +-
10 libavcodec/arm/asm.S | 12 +
11 libavcodec/arm/vp8dsp_armv6.S | 2328 ++++++++++++++++++++++++++++++++++++++
12 libavcodec/arm/vp8dsp_init_arm.c | 324 ++++--
13 libavcodec/arm/vp8dsp_neon.S | 29 -
14 5 files changed, 2577 insertions(+), 119 deletions(-)
15 create mode 100644 libavcodec/arm/vp8dsp_armv6.S
16
17 diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
18 index 3374f0e..cc5a2a7 100644
19 --- a/libavcodec/arm/Makefile
20 +++ b/libavcodec/arm/Makefile
21 @@ -11,7 +11,8 @@ ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
22 OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
23 OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
24 OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
25 -ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o
26 +ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
27 + arm/vp8dsp_armv6.o
28
29 OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
30 OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
31 diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
32 index a7d3ace..c398f37 100644
33 --- a/libavcodec/arm/asm.S
34 +++ b/libavcodec/arm/asm.S
35 @@ -97,6 +97,12 @@ T add \rn, \rn, \rm
36 T ldr \rt, [\rn]
37 .endm
38
39 +.macro ldr_dpren rt, rn, rm:vararg
40 +A ldr \rt, [\rn, -\rm]
41 +T sub \rt, \rn, \rm
42 +T ldr \rt, [\rt]
43 +.endm
44 +
45 .macro ldr_post rt, rn, rm:vararg
46 A ldr \rt, [\rn], \rm
47 T ldr \rt, [\rn]
48 @@ -133,6 +139,12 @@ T ldrh \rt, [\rn]
49 T add \rn, \rn, \rm
50 .endm
51
52 +.macro ldrb_post rt, rn, rm
53 +A ldrb \rt, [\rn], \rm
54 +T ldrb \rt, [\rn]
55 +T add \rn, \rn, \rm
56 +.endm
57 +
58 .macro str_post rt, rn, rm:vararg
59 A str \rt, [\rn], \rm
60 T str \rt, [\rn]
61 diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
62 new file mode 100644
63 index 0000000..4e7b783
64 --- /dev/null
65 +++ b/libavcodec/arm/vp8dsp_armv6.S
66 @@ -0,0 +1,2328 @@
67 +/**
68 + * VP8 ARMv6 optimisations
69 + *
70 + * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
71 + * Copyright (c) 2010 Rob Clark <rob@ti.com>
72 + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
73 + *
74 + * This file is part of Libav.
75 + *
76 + * Libav is free software; you can redistribute it and/or
77 + * modify it under the terms of the GNU Lesser General Public
78 + * License as published by the Free Software Foundation; either
79 + * version 2.1 of the License, or (at your option) any later version.
80 + *
81 + * Libav is distributed in the hope that it will be useful,
82 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
83 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
84 + * Lesser General Public License for more details.
85 + *
86 + * You should have received a copy of the GNU Lesser General Public
87 + * License along with Libav; if not, write to the Free Software
88 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
89 + *
90 + * This code was partially ported from libvpx, which uses this license:
91 + *
92 + * Use of this source code is governed by a BSD-style license
93 + * that can be found in the LICENSE file in the root of the source
94 + * tree. An additional intellectual property rights grant can be found
95 + * in the file PATENTS. All contributing project authors may
96 + * be found in the AUTHORS file in the root of the source tree.
97 + *
98 + * (Note that the "LICENSE", "AUTHORS" and "PATENTS" files can be
99 + * found in the libvpx source tree.)
100 + */
101 +
102 +#include "asm.S"
103 +
104 +@ idct
105 +
106 +@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16])
107 +function ff_vp8_luma_dc_wht_armv6, export=1
108 + push {r4 - r10, lr}
109 +
110 + @ load dc[] and zero memory
111 + mov r12, #0
112 + ldr r2, [r1] @ dc0[0,1]
113 + ldr r3, [r1, #4] @ dc0[2,3]
114 + ldr r4, [r1, #8] @ dc1[0,1]
115 + ldr r5, [r1, #12] @ dc1[2,3]
116 + ldr r6, [r1, #16] @ dc2[0,1]
117 + ldr r7, [r1, #20] @ dc2[2,3]
118 + ldr r8, [r1, #24] @ dc3[0,1]
119 + ldr r9, [r1, #28] @ dc3[2,3]
120 + str r12,[r1]
121 + str r12,[r1, #4]
122 + str r12,[r1, #8]
123 + str r12,[r1, #12]
124 + str r12,[r1, #16]
125 + str r12,[r1, #20]
126 + str r12,[r1, #24]
127 + str r12,[r1, #28]
128 +
129 + @ loop1
130 + uadd16 r12, r2, r8 @ t0[0,1]
131 + uadd16 r14, r3, r9 @ t0[2,3]
132 + usub16 r2, r2, r8 @ t3[0,1]
133 + usub16 r3, r3, r9 @ t3[2,3]
134 + uadd16 r8, r4, r6 @ t1[0,1]
135 + uadd16 r9, r5, r7 @ t1[2,3]
136 + usub16 r4, r4, r6 @ t2[0,1]
137 + usub16 r5, r5, r7 @ t2[2,3]
138 +
139 + uadd16 r6, r12, r8 @ dc0[0,1]
140 + uadd16 r7, r14, r9 @ dc0[2,3]
141 + usub16 r12, r12, r8 @ dc2[0,1]
142 + usub16 r14, r14, r9 @ dc2[2,3]
143 + uadd16 r8, r2, r4 @ dc1[0,1]
144 + uadd16 r9, r3, r5 @ dc1[2,3]
145 + usub16 r2, r2, r4 @ dc3[0,1]
146 + usub16 r3, r3, r5 @ dc3[2,3]
147 +
148 + mov r1, #3
149 + orr r1, r1, #0x30000 @ 3 | 3 (round)
150 +
151 + @ "transpose"
152 + pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0]
153 + pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1]
154 + pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0]
155 + pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1]
156 + pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2]
157 + uadd16 r4, r4, r1
158 + uadd16 r5, r5, r1
159 + pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3]
160 + pkhbt r2, r14, r3, lsl #16 @ dc{2,3}[2]
161 + pkhtb r14, r3, r14, asr #16 @ dc{2,3}[3]
162 +
163 + @ loop2
164 + uadd16 r9, r4, r7 @ t0[0,1]
165 + uadd16 r3, r5, r14 @ t0[2,3]
166 + usub16 r4, r4, r7 @ t3[0,1]
167 + usub16 r5, r5, r14 @ t3[2,3]
168 + uadd16 r7, r6, r8 @ t1[0,1]
169 + uadd16 r14, r12, r2 @ t1[2,3]
170 + usub16 r6, r6, r8 @ t2[0,1]
171 + usub16 r12, r12, r2 @ t2[2,3]
172 +
173 + uadd16 r8, r9, r7 @ block[0,1][0]
174 + uadd16 r2, r3, r14 @ block[2,3][0]
175 + usub16 r9, r9, r7 @ block[0,1][2]
176 + usub16 r3, r3, r14 @ block[2,3][2]
177 + uadd16 r7, r4, r6 @ block[0,1][1]
178 + uadd16 r14, r5, r12 @ block[2,3][1]
179 + usub16 r4, r4, r6 @ block[0,1][3]
180 + usub16 r5, r5, r12 @ block[2,3][3]
181 +
182 + @ store
183 + mov r6, r8, asr #19 @ block[1][0]
184 + mov r12, r7, asr #19 @ block[1][1]
185 + mov r1, r9, asr #19 @ block[1][2]
186 + mov r10, r4, asr #19 @ block[1][3]
187 + sxth r8, r8
188 + sxth r7, r7
189 + sxth r9, r9
190 + sxth r4, r4
191 + asr r8, #3 @ block[0][0]
192 + asr r7, #3 @ block[0][1]
193 + asr r9, #3 @ block[0][2]
194 + asr r4, #3 @ block[0][3]
195 +
196 + strh r8, [r0], #32
197 + strh r7, [r0], #32
198 + strh r9, [r0], #32
199 + strh r4, [r0], #32
200 + strh r6, [r0], #32
201 + strh r12,[r0], #32
202 + strh r1, [r0], #32
203 + strh r10,[r0], #32
204 +
205 + mov r6, r2, asr #19 @ block[3][0]
206 + mov r12, r14, asr #19 @ block[3][1]
207 + mov r1, r3, asr #19 @ block[3][2]
208 + mov r10, r5, asr #19 @ block[3][3]
209 + sxth r2, r2
210 + sxth r14, r14
211 + sxth r3, r3
212 + sxth r5, r5
213 + asr r2, #3 @ block[2][0]
214 + asr r14, #3 @ block[2][1]
215 + asr r3, #3 @ block[2][2]
216 + asr r5, #3 @ block[2][3]
217 +
218 + strh r2, [r0], #32
219 + strh r14,[r0], #32
220 + strh r3, [r0], #32
221 + strh r5, [r0], #32
222 + strh r6, [r0], #32
223 + strh r12,[r0], #32
224 + strh r1, [r0], #32
225 + strh r10,[r0], #32
226 +
227 + pop {r4 - r10, pc}
228 +endfunc
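
For reference, a scalar C sketch of what this function computes, modeled on the C fallback vp8_luma_dc_wht_c in libavcodec/vp8dsp.c (names here are illustrative); the asm performs the same two butterfly passes four 16-bit lanes at a time with uadd16/usub16:

    #include <stdint.h>

    /* Inverse Walsh-Hadamard transform of the 16 luma DC coefficients.
     * The +3 bias and >>3 match the "3 | 3 (round)" constant and the
     * final asr #3 above; each result lands as the DC of one 4x4
     * sub-block, i.e. every 16th int16_t (the #32-byte store stride). */
    static void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
    {
        int t0, t1, t2, t3;

        for (int i = 0; i < 4; i++) {           /* pass 1: columns */
            t0 = dc[0*4 + i] + dc[3*4 + i];
            t1 = dc[1*4 + i] + dc[2*4 + i];
            t2 = dc[1*4 + i] - dc[2*4 + i];
            t3 = dc[0*4 + i] - dc[3*4 + i];
            dc[0*4 + i] = t0 + t1;
            dc[1*4 + i] = t3 + t2;
            dc[2*4 + i] = t0 - t1;
            dc[3*4 + i] = t3 - t2;
        }
        for (int i = 0; i < 4; i++) {           /* pass 2: rows */
            t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3; /* rounding */
            t1 = dc[i*4 + 1] + dc[i*4 + 2];
            t2 = dc[i*4 + 1] - dc[i*4 + 2];
            t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3;
            dc[i*4 + 0] = dc[i*4 + 1] = dc[i*4 + 2] = dc[i*4 + 3] = 0;
            block[i][0][0] = (t0 + t1) >> 3;
            block[i][1][0] = (t3 + t2) >> 3;
            block[i][2][0] = (t0 - t1) >> 3;
            block[i][3][0] = (t3 - t2) >> 3;
        }
    }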
229 +
230 +@ void vp8_luma_dc_wht_dc(DCTELEM block[4][4][16], DCTELEM dc[16])
231 +function ff_vp8_luma_dc_wht_dc_armv6, export=1
232 + ldrsh r2, [r1]
233 + mov r3, #0
234 + add r2, r2, #3
235 + strh r3, [r1]
236 + asr r2, r2, #3
237 + .rept 16
238 + strh r2, [r0], #32
239 + .endr
240 + bx lr
241 +endfunc
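
The DC-only variant reduces to replicating one rounded value into all 16 sub-block DCs, matching the ldrsh / +3 / asr #3 / .rept 16 sequence above (sketch, same model):

    static void luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
    {
        int val = (dc[0] + 3) >> 3;
        dc[0] = 0;
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                block[i][j][0] = val;
    }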
242 +
243 +@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride)
244 +function ff_vp8_idct_add_armv6, export=1
245 + push {r4 - r11, lr}
246 + sub sp, sp, #32
247 +
248 + mov r3, #0x00004E00 @ cos
249 + orr r3, r3, #0x0000007B @ cospi8sqrt2minus1 = 20091
250 + mov r4, #0x00008A00 @ sin
251 + orr r4, r4, #0x0000008C @ sinpi8sqrt2 = 35468
252 + mov r5, #0x2 @ i=2
253 +1:
254 + ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0]
255 + ldr r12,[r1, #24] @ i13 | i12 = block3[1] | block3[0]
256 + ldr r14,[r1, #16] @ i9 | i8 = block2[1] | block2[0]
257 +
258 + smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >> 16
259 + smulwb r7, r3, r6 @ (ip[4] * cospi8sqrt2minus1) >> 16
260 + smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16
261 + smulwb r8, r4, r6 @ (ip[4] * sinpi8sqrt2) >> 16
262 + pkhbt r7, r7, r9, lsl #16 @ 5c | 4c
263 + smulwt r11, r3, r12 @ (ip[13] * cospi8sqrt2minus1) >> 16
264 + pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half
265 + uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half
266 + smulwt r7, r4, r12 @ (ip[13] * sinpi8sqrt2) >> 16
267 + smulwb r9, r3, r12 @ (ip[12] * cospi8sqrt2minus1) >> 16
268 + smulwb r10, r4, r12 @ (ip[12] * sinpi8sqrt2) >> 16
269 +
270 + subs r5, r5, #1 @ i--
271 + pkhbt r9, r9, r11, lsl #16 @ 13c | 12c
272 + ldr r11,[r1] @ i1 | i0
273 + pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half
274 + uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 second half
275 + usub16 r7, r8, r7 @ c = t2
276 + uadd16 r6, r6, r10 @ d = t3
277 + uadd16 r10, r11, r14 @ a = t0
278 + usub16 r8, r11, r14 @ b = t1
279 + uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0]
280 + usub16 r10, r10, r6 @ a-d = tmp{0,1}[3]
281 + uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1]
282 + usub16 r7, r8, r7 @ b-c = tmp{0,1}[2]
283 + mov r8, #0
284 + str r6, [sp, #8] @ o5 | o4
285 + str r7, [sp, #16] @ o9 | o8
286 + str r10,[sp, #24] @ o13 | o12
287 + str r9, [sp], #4 @ o1 | o0
288 + str r8, [r1, #24]
289 + str r8, [r1, #16]
290 + str r8, [r1, #8]
291 + str r8, [r1], #4
292 + bne 1b
293 +
294 + mov r5, #0x2 @ i=2
295 + sub sp, sp, #8
296 +2:
297 + ldr r6, [sp, #8] @ i5 | i4 = tmp{0,1}[1]
298 + ldr r14,[sp, #4] @ i3 | i2 = tmp{2,3}[0]
299 + ldr r12,[sp, #12] @ i7 | i6 = tmp{2,3}[1]
300 + ldr r1, [sp], #16 @ i1 | i0 = tmp{0,1}[0]
301 + smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >> 16
302 + smulwt r7, r3, r1 @ (ip[1] * cospi8sqrt2minus1) >> 16
303 + smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16
304 + smulwt r8, r4, r1 @ (ip[1] * sinpi8sqrt2) >> 16
305 + pkhbt r11, r1, r6, lsl #16 @ i4 | i0 = t0/t1 first half
306 + pkhbt r7, r7, r9, lsl #16 @ 5c | 1c
307 + pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = temp1 = t2 first half
308 + pkhtb r1, r6, r1, asr #16 @ i5 | i1
309 + uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = temp2 (d) = t3 first half
310 + pkhbt r9, r14, r12, lsl #16 @ i6 | i2 = t0/t1 second half
311 + uadd16 r10, r11, r9 @ a = t0
312 + usub16 r9, r11, r9 @ b = t1
313 + pkhtb r6, r12, r14, asr #16 @ i7 | i3
314 + subs r5, r5, #0x1 @ i--
315 + smulwt r7, r3, r6 @ (ip[7] * cospi8sqrt2minus1) >> 16
316 + smulwt r11, r4, r6 @ (ip[7] * sinpi8sqrt2) >> 16
317 + smulwb r12, r3, r6 @ (ip[3] * cospi8sqrt2minus1) >> 16
318 + smulwb r14, r4, r6 @ (ip[3] * sinpi8sqrt2) >> 16
319 +
320 + pkhbt r7, r12, r7, lsl #16 @ 7c | 3c
321 + pkhbt r11, r14, r11, lsl #16 @ 7s | 3s = temp1 (d) = t3 second half
322 + mov r14, #0x4 @ set up 4's
323 + orr r14, r14, #0x40000 @ 4|4
324 + uadd16 r6, r7, r6 @ 7c+7 | 3c+3 = temp2 (c) = t2 second half
325 + usub16 r12, r8, r6 @ c (o5 | o1) = t2
326 + uadd16 r6, r11, r1 @ d (o7 | o3) = t3
327 + uadd16 r10, r10, r14 @ t0 + 4
328 + uadd16 r9, r9, r14 @ t1 + 4
329 + uadd16 r7, r10, r6 @ a+d = dst{0,1}[0]
330 + usub16 r6, r10, r6 @ a-d = dst{0,1}[3]
331 + uadd16 r10, r9, r12 @ b+c = dst{0,1}[1]
332 + usub16 r1, r9, r12 @ b-c = dst{0,1}[2]
333 +
334 + mov r9, r6, asr #3 @ o[1][3]
335 + mov r12, r1, asr #3 @ o[1][2]
336 + pkhtb r8, r12, r7, asr #19 @ o[1][0,2]
337 + pkhtb r11, r9, r10, asr #19 @ o[1][1,3]
338 + ldr r12,[r0]
339 + ldr r9, [r0, r2]
340 + sxth r7, r7
341 + sxth r6, r6
342 + sxth r10, r10
343 + sxth r1, r1
344 + asr r7, #3 @ o[0][0]
345 + asr r10, #3 @ o[0][1]
346 + pkhbt r7, r7, r1, lsl #13 @ o[0][0,2]
347 + pkhbt r10, r10, r6, lsl #13 @ o[0][1,3]
348 +
349 + uxtab16 r7, r7, r12
350 + uxtab16 r10, r10, r12, ror #8
351 + uxtab16 r8, r8, r9
352 + uxtab16 r11, r11, r9, ror #8
353 + usat16 r7, #8, r7
354 + usat16 r10, #8, r10
355 + usat16 r8, #8, r8
356 + usat16 r11, #8, r11
357 + orr r7, r7, r10, lsl #8
358 + orr r8, r8, r11, lsl #8
359 + str r8, [r0, r2]
360 + str_post r7, r0, r2, lsl #1
361 +
362 + bne 2b
363 +
364 + pop {r4 - r11, pc}
365 +endfunc
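
The two constants are VP8's fixed-point rotation factors, 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and 35468/65536 ~= sqrt(2)*sin(pi/8); smulwb/smulwt do the Q16 multiply and the "5c+5 | 4c+4"-style uadd16 restores the 1.0 that the minus-1 constant drops. A scalar C sketch of the whole function, modeled on the C fallback in libavcodec/vp8dsp.c (names and the clip helper are illustrative):

    #include <stdint.h>

    #define MUL_20091(a) ((((a) * 20091) >> 16) + (a)) /* ~ a*sqrt(2)cos(pi/8) */
    #define MUL_35468(a)  (((a) * 35468) >> 16)        /* ~ a*sqrt(2)sin(pi/8) */

    static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void idct_add(uint8_t *dst, int16_t block[16], int stride)
    {
        int t0, t1, t2, t3, tmp[16];

        for (int i = 0; i < 4; i++) {          /* pass 1: columns, zero input */
            t0 = block[0*4 + i] + block[2*4 + i];
            t1 = block[0*4 + i] - block[2*4 + i];
            t2 = MUL_35468(block[1*4 + i]) - MUL_20091(block[3*4 + i]);
            t3 = MUL_20091(block[1*4 + i]) + MUL_35468(block[3*4 + i]);
            block[0*4 + i] = block[1*4 + i] = block[2*4 + i] = block[3*4 + i] = 0;
            tmp[i*4 + 0] = t0 + t3;
            tmp[i*4 + 1] = t1 + t2;
            tmp[i*4 + 2] = t1 - t2;
            tmp[i*4 + 3] = t0 - t3;
        }
        for (int i = 0; i < 4; i++) {          /* pass 2: rows, +4 >>3, add */
            t0 = tmp[0*4 + i] + tmp[2*4 + i];
            t1 = tmp[0*4 + i] - tmp[2*4 + i];
            t2 = MUL_35468(tmp[1*4 + i]) - MUL_20091(tmp[3*4 + i]);
            t3 = MUL_20091(tmp[1*4 + i]) + MUL_35468(tmp[3*4 + i]);
            dst[0] = clip_u8(dst[0] + ((t0 + t3 + 4) >> 3));
            dst[1] = clip_u8(dst[1] + ((t1 + t2 + 4) >> 3));
            dst[2] = clip_u8(dst[2] + ((t1 - t2 + 4) >> 3));
            dst[3] = clip_u8(dst[3] + ((t0 - t3 + 4) >> 3));
            dst += stride;
        }
    }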
366 +
367 +@ void vp8_idct_dc_add(uint8_t *dst, DCTELEM block[16], int stride)
368 +function ff_vp8_idct_dc_add_armv6, export=1
369 + push {r4 - r5, lr}
370 + ldrsh r3, [r1]
371 + mov r4, #0
372 + add r3, r3, #4
373 + asr r3, #3
374 + strh r4, [r1], #32
375 + ldr r4, [r0, r2]
376 + ldr_post r5, r0, r2, lsl #1
377 + pkhbt r3, r3, r3, lsl #16
378 +
379 + uxtab16 lr, r3, r5 @ a1+2 | a1+0
380 + uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1
381 + uxtab16 r12, r3, r4
382 + uxtab16 r4, r3, r4, ror #8
383 + usat16 lr, #8, lr
384 + usat16 r5, #8, r5
385 + usat16 r12, #8, r12
386 + usat16 r4, #8, r4
387 + orr lr, lr, r5, lsl #8
388 + orr r12, r12, r4, lsl #8
389 + ldr r5, [r0]
390 + ldr r4, [r0, r2]
391 + sub r0, r0, r2, lsl #1
392 + str r12,[r0, r2]
393 + str_post lr, r0, r2, lsl #1
394 +
395 + uxtab16 lr, r3, r5
396 + uxtab16 r5, r3, r5, ror #8
397 + uxtab16 r12, r3, r4
398 + uxtab16 r4, r3, r4, ror #8
399 + usat16 lr, #8, lr
400 + usat16 r5, #8, r5
401 + usat16 r12, #8, r12
402 + usat16 r4, #8, r4
403 + orr lr, lr, r5, lsl #8
404 + orr r12, r12, r4, lsl #8
405 +
406 + str r12,[r0, r2]
407 + str_post lr, r0, r2, lsl #1
408 +
409 + pop {r4 - r5, pc}
410 +endfunc
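
Per pixel this is just a rounded DC added with unsigned saturation; the uxtab16/usat16 pairs above do it four bytes at a time. Sketch (clip_u8 as in the IDCT sketch above):

    static void idct_dc_add(uint8_t *dst, int16_t block[16], int stride)
    {
        int dc = (block[0] + 4) >> 3;   /* same +4, asr #3 rounding */
        block[0] = 0;
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++)
                dst[x] = clip_u8(dst[x] + dc);
            dst += stride;
        }
    }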
411 +
412 +@ void vp8_idct_dc_add4uv(uint8_t *dst, DCTELEM block[4][16], int stride)
413 +function ff_vp8_idct_dc_add4uv_armv6, export=1
414 + push {lr}
415 +
416 + bl ff_vp8_idct_dc_add_armv6
417 + sub r0, r0, r2, lsl #2
418 + add r0, r0, #4
419 + bl ff_vp8_idct_dc_add_armv6
420 + sub r0, r0, #4
421 + bl ff_vp8_idct_dc_add_armv6
422 + sub r0, r0, r2, lsl #2
423 + add r0, r0, #4
424 + bl ff_vp8_idct_dc_add_armv6
425 +
426 + pop {pc}
427 +endfunc
428 +
429 +@ void vp8_idct_dc_add4y(uint8_t *dst, DCTELEM block[4][16], int stride)
430 +function ff_vp8_idct_dc_add4y_armv6, export=1
431 + push {lr}
432 +
433 + bl ff_vp8_idct_dc_add_armv6
434 + sub r0, r0, r2, lsl #2
435 + add r0, r0, #4
436 + bl ff_vp8_idct_dc_add_armv6
437 + sub r0, r0, r2, lsl #2
438 + add r0, r0, #4
439 + bl ff_vp8_idct_dc_add_armv6
440 + sub r0, r0, r2, lsl #2
441 + add r0, r0, #4
442 + bl ff_vp8_idct_dc_add_armv6
443 +
444 + pop {pc}
445 +endfunc
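
Both wrappers only reposition dst between calls: ff_vp8_idct_dc_add_armv6 returns with r0 advanced four lines, which the sub/add pairs compensate. In C terms (sketch), the UV variant covers a 2x2 arrangement of 4x4 blocks and the Y variant four blocks side by side:

    static void idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride)
    {
        idct_dc_add(dst,                  block[0], stride);
        idct_dc_add(dst + 4,              block[1], stride);
        idct_dc_add(dst + 4 * stride,     block[2], stride);
        idct_dc_add(dst + 4 * stride + 4, block[3], stride);
    }

    static void idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride)
    {
        idct_dc_add(dst,      block[0], stride);
        idct_dc_add(dst + 4,  block[1], stride);
        idct_dc_add(dst + 8,  block[2], stride);
        idct_dc_add(dst + 12, block[3], stride);
    }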
446 +
447 +@ loopfilter
448 +
449 +@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
450 +function ff_vp8_v_loop_filter16_simple_armv6, export=1
451 + push {r4 - r11, lr}
452 +
453 + ldr_dpren r3, r0, r1, lsl #1 @ p1
454 + ldr_dpren r4, r0, r1 @ p0
455 + ldr r5, [r0] @ q0
456 + ldr r6, [r0, r1] @ q1
457 + orr r2, r2, r2, lsl #16
458 + mov r9, #4 @ count
459 + mov lr, #0 @ need 0 in a couple places
460 + orr r12, r2, r2, lsl #8 @ splat int -> byte
461 + ldr r2, c0x80808080
462 +
463 +1:
464 + @ vp8_simple_filter_mask()
465 + uqsub8 r7, r3, r6 @ p1 - q1
466 + uqsub8 r8, r6, r3 @ q1 - p1
467 + uqsub8 r10, r4, r5 @ p0 - q0
468 + uqsub8 r11, r5, r4 @ q0 - p0
469 + orr r8, r8, r7 @ abs(p1 - q1)
470 + orr r10, r10, r11 @ abs(p0 - q0)
471 + uqadd8 r10, r10, r10 @ abs(p0 - q0) * 2
472 + uhadd8 r8, r8, lr @ abs(p1 - q1) >> 1
473 + uqadd8 r10, r10, r8 @ abs(p0 - q0)*2 + abs(p1 - q1)/2
474 + mvn r8, #0
475 + usub8 r10, r12, r10 @ compare to flimit. usub8 sets GE flags
476 + sel r10, r8, lr @ filter mask: F or 0
477 + cmp r10, #0
478 + beq 2f @ skip filtering if all masks are 0x00
479 +
480 + @ vp8_simple_filter()
481 + eor r3, r3, r2 @ p1 offset to convert to a signed value
482 + eor r6, r6, r2 @ q1 offset to convert to a signed value
483 + eor r4, r4, r2 @ p0 offset to convert to a signed value
484 + eor r5, r5, r2 @ q0 offset to convert to a signed value
485 +
486 + qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
487 + qsub8 r6, r5, r4 @ q0 - p0
488 + qadd8 r3, r3, r6 @ += q0 - p0
489 + ldr r7, c0x04040404
490 + qadd8 r3, r3, r6 @ += q0 - p0
491 + ldr r8, c0x03030303
492 + qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)
493 + @STALL
494 + and r3, r3, r10 @ vp8_filter &= mask
495 +
496 + qadd8 r7, r3, r7 @ Filter1 = vp8_filter + 4
497 + qadd8 r8, r3, r8 @ Filter2 = vp8_filter + 3
498 +
499 + shadd8 r7, r7, lr
500 + shadd8 r8, r8, lr
501 + shadd8 r7, r7, lr
502 + shadd8 r8, r8, lr
503 + shadd8 r7, r7, lr @ Filter1 >>= 3
504 + shadd8 r8, r8, lr @ Filter2 >>= 3
505 +
506 + qsub8 r5, r5, r7 @ u = q0 - Filter1
507 + qadd8 r4, r4, r8 @ u = p0 + Filter2
508 + eor r5, r5, r2 @ *oq0 = u^0x80
509 + eor r4, r4, r2 @ *op0 = u^0x80
510 +T sub r7, r0, r1
511 + str r5, [r0] @ store oq0 result
512 +A str r4, [r0, -r1] @ store op0 result
513 +T str r4, [r7]
514 +
515 +2:
516 + subs r9, r9, #1 @ counter--
517 + add r0, r0, #4 @ next row
518 +T itttt ne
519 +A ldrne r3, [r0, -r1, lsl #1] @ p1
520 +T subne r3, r0, r1, lsl #1
521 +T ldrne r3, [r3] @ p1
522 +A ldrne r4, [r0, -r1] @ p0
523 +T subne r4, r0, r1
524 +T ldrne r4, [r4] @ p0
525 +T itt ne
526 + ldrne r5, [r0] @ q0
527 + ldrne r6, [r0, r1] @ q1
528 +
529 + bne 1b
530 +
531 + pop {r4 - r11, pc}
532 +endfunc
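
A per-pixel C model of one lane of the 4-wide SIMD pass above (a sketch: the uqsub8/orr pairs build |a-b|, sel turns the GE flags into a 0x00/0xff byte mask, and the ^0x80 bias stands in for signed bytes):

    #include <stdint.h>
    #include <stdlib.h>

    static int clip_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    static void simple_vfilter_px(uint8_t *dst, int stride, int flim)
    {
        int p1 = dst[-2*stride] - 128, p0 = dst[-stride] - 128;
        int q0 = dst[0]         - 128, q1 = dst[ stride] - 128;

        if (2*abs(p0 - q0) + (abs(p1 - q1) >> 1) > flim)
            return;                         /* mask byte is 0: leave pixels */

        int d = clip_s8(q0 - p0);           /* each step saturates, as qadd8 */
        int f = clip_s8(p1 - q1);
        f = clip_s8(f + d);
        f = clip_s8(f + d);
        f = clip_s8(f + d);                 /* f = p1-q1 + 3*(q0-p0) */

        dst[0]       = clip_s8(q0 - (clip_s8(f + 4) >> 3)) + 128; /* Filter1 */
        dst[-stride] = clip_s8(p0 + (clip_s8(f + 3) >> 3)) + 128; /* Filter2 */
    }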
533 +
534 +c0x01010101: .long 0x01010101
535 +c0x03030303: .long 0x03030303
536 +c0x04040404: .long 0x04040404
537 +c0x7F7F7F7F: .long 0x7F7F7F7F
538 +c0x80808080: .long 0x80808080
539 +
540 +@ void vp8_v_loop_filter16_inner(uint8_t *dst, int stride,
541 +@ int fE, int fI, int hev_thresh)
542 +@ and
543 +@ void vp8_v_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
544 +@ int fE, int fI, int hev_thresh)
545 +@ call:
546 +@ void vp8_v_loop_filter_inner(uint8_t *dst, int stride,
547 +@ int fE, int fI, int hev_thresh, int count)
548 +function ff_vp8_v_loop_filter_inner_armv6, export=1
549 + push {r4 - r11, lr}
550 +
551 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
552 + ldr r5, [sp, #40] @ counter
553 + ldr r6, [sp, #36] @ load thresh address
554 + sub sp, sp, #16 @ create temp buffer
555 +
556 + ldr r10,[r0, r1] @ p2
557 + ldr_post r9, r0, r1, lsl #1 @ p3
558 + ldr r12,[r0, r1] @ p0
559 + ldr_post r11, r0, r1, lsl #1 @ p1
560 +
561 + orr r2, r2, r2, lsl #16
562 + orr r3, r3, r3, lsl #16
563 + orr r6, r6, r6, lsl #16
564 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
565 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
566 + orr r3, r6, r6, lsl #8 @ thresh splat int -> byte
567 +
568 +1:
569 + @ vp8_filter_mask() function
570 + @ calculate breakout conditions
571 + uqsub8 r6, r9, r10 @ p3 - p2
572 + uqsub8 r7, r10, r9 @ p2 - p3
573 + uqsub8 r8, r10, r11 @ p2 - p1
574 + uqsub8 r10, r11, r10 @ p1 - p2
575 +
576 + orr r6, r6, r7 @ abs (p3-p2)
577 + orr r8, r8, r10 @ abs (p2-p1)
578 + uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_filter_mask
579 + uqsub8 r8, r8, r2 @ compare to limit
580 + uqsub8 r6, r11, r12 @ p1 - p0
581 + orr lr, lr, r8
582 + uqsub8 r7, r12, r11 @ p0 - p1
583 + ldr r10,[r0, r1] @ q1
584 + ldr_post r9, r0, r1, lsl #1 @ q0
585 + orr r6, r6, r7 @ abs (p1-p0)
586 + uqsub8 r7, r6, r2 @ compare to limit
587 + uqsub8 r8, r6, r3 @ compare to thresh -- save r8 for later
588 + orr lr, lr, r7
589 +
590 + uqsub8 r6, r11, r10 @ p1 - q1
591 + uqsub8 r7, r10, r11 @ q1 - p1
592 + uqsub8 r11, r12, r9 @ p0 - q0
593 + uqsub8 r12, r9, r12 @ q0 - p0
594 + orr r6, r6, r7 @ abs (p1-q1)
595 + ldr r7, c0x7F7F7F7F
596 + orr r12, r11, r12 @ abs (p0-q0)
597 + ldr_post r11, r0, r1 @ q2
598 + uqadd8 r12, r12, r12 @ abs (p0-q0) * 2
599 + and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2
600 + uqsub8 r7, r9, r10 @ q0 - q1
601 + uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
602 + uqsub8 r6, r10, r9 @ q1 - q0
603 + uqsub8 r12, r12, r4 @ compare to flimit
604 + uqsub8 r9, r11, r10 @ q2 - q1
605 +
606 + orr lr, lr, r12
607 +
608 + ldr_post r12, r0, r1 @ q3
609 + uqsub8 r10, r10, r11 @ q1 - q2
610 + orr r6, r7, r6 @ abs (q1-q0)
611 + orr r10, r9, r10 @ abs (q2-q1)
612 + uqsub8 r7, r6, r2 @ compare to limit
613 + uqsub8 r10, r10, r2 @ compare to limit
614 + uqsub8 r6, r6, r3 @ compare to thresh -- save r6 for later
615 + orr lr, lr, r7
616 + orr lr, lr, r10
617 +
618 + uqsub8 r10, r12, r11 @ q3 - q2
619 + uqsub8 r9, r11, r12 @ q2 - q3
620 +
621 + mvn r11, #0 @ r11 == -1
622 +
623 + orr r10, r10, r9 @ abs (q3-q2)
624 + uqsub8 r10, r10, r2 @ compare to limit
625 +
626 + mov r12, #0
627 + orr lr, lr, r10
628 + sub r0, r0, r1, lsl #2
629 +
630 + usub8 lr, r12, lr @ use usub8 instead of ssub8
631 + sel lr, r11, r12 @ filter mask: lr
632 +
633 + cmp lr, #0
634 + beq 2f @ skip filtering
635 +
636 + sub r0, r0, r1, lsl #1 @ move r0 pointer down by 6 lines
637 +
638 + @vp8_hevmask() function
639 + @calculate high edge variance
640 + orr r10, r6, r8 @ calculate vp8_hevmask
641 +
642 + usub8 r10, r12, r10 @ use usub8 instead of ssub8
643 + sel r6, r12, r11 @ obtain vp8_hevmask: r6
644 +
645 + @vp8_filter() function
646 + ldr r8, [r0, r1] @ p0
647 + ldr_post r7, r0, r1, lsl #1 @ p1
648 + ldr r12, c0x80808080
649 + ldr r10,[r0, r1] @ q1
650 + ldr_post r9, r0, r1, lsl #1 @ q0
651 +
652 + eor r7, r7, r12 @ p1 offset to convert to a signed value
653 + eor r8, r8, r12 @ p0 offset to convert to a signed value
654 + eor r9, r9, r12 @ q0 offset to convert to a signed value
655 + eor r10, r10, r12 @ q1 offset to convert to a signed value
656 +
657 + str r9, [sp] @ store qs0 temporarily
658 + str r8, [sp, #4] @ store ps0 temporarily
659 + str r10,[sp, #8] @ store qs1 temporarily
660 + str r7, [sp, #12] @ store ps1 temporarily
661 +
662 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
663 + qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
664 +
665 + and r7, r7, r6 @ vp8_filter (r7) &= hev
666 +
667 + qadd8 r7, r7, r8
668 + ldr r9, c0x03030303 @ r9 = 3 --modified for vp8
669 +
670 + qadd8 r7, r7, r8
671 + ldr r10, c0x04040404
672 +
673 + qadd8 r7, r7, r8
674 + and r7, r7, lr @ vp8_filter &= mask
675 +
676 + qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
677 + qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
678 +
679 + mov r9, #0
680 + shadd8 r8, r8, r9 @ Filter2 >>= 3
681 + shadd8 r7, r7, r9 @ vp8_filter >>= 3
682 + shadd8 r8, r8, r9
683 + shadd8 r7, r7, r9
684 + shadd8 lr, r8, r9 @ lr: Filter2
685 + shadd8 r7, r7, r9 @ r7: filter
686 +
687 + @calculate output
688 +
689 + ldr r8, [sp] @ load qs0
690 + ldr r9, [sp, #4] @ load ps0
691 +
692 + ldr r10, c0x01010101
693 +
694 + qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0 - vp8_filter)
695 + qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)
696 +
697 + mov lr, #0
698 + sadd8 r7, r7, r10 @ vp8_filter += 1
699 + shadd8 r7, r7, lr @ vp8_filter >>= 1
700 +
701 + ldr r11,[sp, #12] @ load ps1
702 + ldr r10,[sp, #8] @ load qs1
703 +
704 + bic r7, r7, r6 @ vp8_filter &= ~hev
705 + sub r0, r0, r1, lsl #2
706 +
707 + qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
708 + qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
709 +
710 + eor r11, r11, r12 @ *op1 = u^0x80
711 + eor r9, r9, r12 @ *op0 = u^0x80
712 + eor r8, r8, r12 @ *oq0 = u^0x80
713 + eor r10, r10, r12 @ *oq1 = u^0x80
714 + str r9, [r0, r1] @ store op0 result
715 + str_post r11, r0, r1, lsl #1 @ store op1
716 + str r10,[r0, r1] @ store oq1
717 + str_post r8, r0, r1, lsl #1 @ store oq0 result
718 +
719 + sub r0, r0, r1, lsl #1
720 +
721 +2:
722 + add r0, r0, #4
723 + sub r0, r0, r1, lsl #2
724 +
725 + subs r5, r5, #1
726 +T ittt ne
727 + ldrne r10,[r0, r1] @ p2
728 +A ldrne r9, [r0], r1, lsl #1 @ p3
729 +T ldrne r9, [r0] @ p3
730 +T addne r0, r0, r1, lsl #1
731 +T ittt ne
732 + ldrne r12,[r0, r1] @ p0
733 +A ldrne r11,[r0], r1, lsl #1 @ p1
734 +T ldrne r11,[r0] @ p1
735 +T addne r0, r0, r1, lsl #1
736 +
737 + bne 1b
738 +
739 + add sp, sp, #16
740 + pop {r4 - r11, pc}
741 +endfunc
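
Stripped of the SIMD plumbing, the breakout test accumulated in lr is the standard VP8 normal-filter limit check, and r8/r6 carry the high-edge-variance (hev) comparison that selects the stronger path. As a per-pixel C sketch (illustrative names):

    #include <stdlib.h>

    /* flimI bounds every neighbouring tap difference; flimE bounds the
     * edge term. Filtering is skipped where this returns 0. */
    static int normal_filter_mask(int flimE, int flimI,
                                  int p3, int p2, int p1, int p0,
                                  int q0, int q1, int q2, int q3)
    {
        return abs(p3 - p2) <= flimI && abs(p2 - p1) <= flimI &&
               abs(p1 - p0) <= flimI && abs(q3 - q2) <= flimI &&
               abs(q2 - q1) <= flimI && abs(q1 - q0) <= flimI &&
               2*abs(p0 - q0) + (abs(p1 - q1) >> 1) <= flimE;
    }

    static int hev(int thresh, int p1, int p0, int q0, int q1)
    {
        return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
    }

Where hev is false, the inner filter additionally moves p1 and q1 by (filter + 1) >> 1, which is the sadd8 with c0x01010101 followed by shadd8 above.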
742 +
743 +@ void vp8_v_loop_filter16(uint8_t *dst, int stride,
744 +@ int fE, int fI, int hev_thresh)
745 +@ and
746 +@ void vp8_v_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
747 +@ int fE, int fI, int hev_thresh)
748 +@ call:
749 +@ void vp8_v_loop_filter(uint8_t *dst, int stride,
750 +@ int fE, int fI, int hev_thresh, int count)
751 +function ff_vp8_v_loop_filter_armv6, export=1
752 + push {r4 - r11, lr}
753 +
754 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
755 + ldr r5, [sp, #40] @ counter
756 + ldr r6, [sp, #36] @ load thresh address
757 + sub sp, sp, #16 @ create temp buffer
758 +
759 + ldr r10,[r0, r1] @ p2
760 + ldr_post r9, r0, r1, lsl #1 @ p3
761 + ldr r12,[r0, r1] @ p0
762 + ldr_post r11, r0, r1, lsl #1 @ p1
763 +
764 + orr r2, r2, r2, lsl #16
765 + orr r3, r3, r3, lsl #16
766 + orr r6, r6, r6, lsl #16
767 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
768 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
769 + orr r3, r6, r6, lsl #8 @ thresh splat int -> byte
770 +
771 +1:
772 + @ vp8_filter_mask() function
773 + @ calculate breakout conditions
774 + uqsub8 r6, r9, r10 @ p3 - p2
775 + uqsub8 r7, r10, r9 @ p2 - p3
776 + uqsub8 r8, r10, r11 @ p2 - p1
777 + uqsub8 r10, r11, r10 @ p1 - p2
778 +
779 + orr r6, r6, r7 @ abs (p3-p2)
780 + orr r8, r8, r10 @ abs (p2-p1)
781 + uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_filter_mask
782 + uqsub8 r8, r8, r2 @ compare to limit
783 +
784 + uqsub8 r6, r11, r12 @ p1 - p0
785 + orr lr, lr, r8
786 + uqsub8 r7, r12, r11 @ p0 - p1
787 + ldr r10,[r0, r1] @ q1
788 + ldr_post r9, r0, r1, lsl #1 @ q0
789 + orr r6, r6, r7 @ abs (p1-p0)
790 + uqsub8 r7, r6, r2 @ compare to limit
791 + uqsub8 r8, r6, r3 @ compare to thresh -- save r8 for later
792 + orr lr, lr, r7
793 +
794 + uqsub8 r6, r11, r10 @ p1 - q1
795 + uqsub8 r7, r10, r11 @ q1 - p1
796 + uqsub8 r11, r12, r9 @ p0 - q0
797 + uqsub8 r12, r9, r12 @ q0 - p0
798 + orr r6, r6, r7 @ abs (p1-q1)
799 + ldr r7, c0x7F7F7F7F
800 + orr r12, r11, r12 @ abs (p0-q0)
801 + ldr_post r11, r0, r1 @ q2
802 + uqadd8 r12, r12, r12 @ abs (p0-q0) * 2
803 + and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2
804 + uqsub8 r7, r9, r10 @ q0 - q1
805 + uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
806 + uqsub8 r6, r10, r9 @ q1 - q0
807 + uqsub8 r12, r12, r4 @ compare to flimit
808 + uqsub8 r9, r11, r10 @ q2 - q1
809 +
810 + orr lr, lr, r12
811 +
812 + ldr_post r12, r0, r1 @ q3
813 +
814 + uqsub8 r10, r10, r11 @ q1 - q2
815 + orr r6, r7, r6 @ abs (q1-q0)
816 + orr r10, r9, r10 @ abs (q2-q1)
817 + uqsub8 r7, r6, r2 @ compare to limit
818 + uqsub8 r10, r10, r2 @ compare to limit
819 + uqsub8 r6, r6, r3 @ compare to thresh -- save r6 for later
820 + orr lr, lr, r7
821 + orr lr, lr, r10
822 +
823 + uqsub8 r10, r12, r11 @ q3 - q2
824 + uqsub8 r9, r11, r12 @ q2 - q3
825 +
826 + mvn r11, #0 @ r11 == -1
827 +
828 + orr r10, r10, r9 @ abs (q3-q2)
829 + uqsub8 r10, r10, r2 @ compare to limit
830 +
831 + mov r12, #0
832 +
833 + orr lr, lr, r10
834 +
835 + usub8 lr, r12, lr @ use usub8 instead of ssub8
836 + sel lr, r11, r12 @ filter mask: lr
837 +
838 + cmp lr, #0
839 + beq 2f @ skip filtering
840 +
841 + @vp8_hevmask() function
842 + @calculate high edge variance
843 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 6 lines
844 + sub r0, r0, r1, lsl #1
845 +
846 + orr r10, r6, r8
847 +
848 + usub8 r10, r12, r10
849 + sel r6, r12, r11 @ hev mask: r6
850 +
851 + @vp8_mbfilter() function
852 + @p2, q2 are only needed at the end. Do not need to load them in now.
853 + ldr r8, [r0, r1] @ p0
854 + ldr_post r7, r0, r1, lsl #1 @ p1
855 + ldr r12, c0x80808080
856 + ldr_post r9, r0, r1 @ q0
857 + ldr r10,[r0] @ q1
858 +
859 + eor r7, r7, r12 @ ps1
860 + eor r8, r8, r12 @ ps0
861 + eor r9, r9, r12 @ qs0
862 + eor r10, r10, r12 @ qs1
863 +
864 + qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
865 + str r7, [sp, #12] @ store ps1 temporarily
866 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
867 + str r10,[sp, #8] @ store qs1 temporarily
868 + qadd8 r7, r7, r12
869 + str r9, [sp] @ store qs0 temporarily
870 + qadd8 r7, r7, r12
871 + str r8, [sp, #4] @ store ps0 temporarily
872 + qadd8 r7, r7, r12 @ vp8_filter: r7
873 +
874 + ldr r10, c0x03030303 @ r10 = 3 --modified for vp8
875 + ldr r9, c0x04040404
876 +
877 + and r7, r7, lr @ vp8_filter &= mask (lr is free)
878 +
879 + mov r12, r7 @ Filter2: r12
880 + and r12, r12, r6 @ Filter2 &= hev
881 +
882 + @save bottom 3 bits so that we round one side +4 and the other +3
883 + qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
884 + qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
885 +
886 + mov r10, #0
887 + shadd8 r8, r8, r10 @ Filter1 >>= 3
888 + shadd8 r12, r12, r10 @ Filter2 >>= 3
889 + shadd8 r8, r8, r10
890 + shadd8 r12, r12, r10
891 + shadd8 r8, r8, r10 @ r8: Filter1
892 + shadd8 r12, r12, r10 @ r12: Filter2
893 +
894 + ldr r9, [sp] @ load qs0
895 + ldr r11,[sp, #4] @ load ps0
896 +
897 + qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(qs0 - Filter1)
898 + qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(ps0 + Filter2)
899 +
900 + bic r12, r7, r6 @ vp8_filter &= ~hev (r6 is free)
901 +
902 + @roughly 3/7th difference across boundary
903 + mov lr, #0x1b @ 27
904 + mov r7, #0x3f @ 63
905 +
906 + sxtb16 r6, r12
907 + sxtb16 r10, r12, ror #8
908 + smlabb r8, r6, lr, r7
909 + smlatb r6, r6, lr, r7
910 + smlabb r7, r10, lr, r7
911 + smultb r10, r10, lr
912 + ssat r8, #8, r8, asr #7
913 + ssat r6, #8, r6, asr #7
914 + add r10, r10, #63
915 + ssat r7, #8, r7, asr #7
916 + ssat r10, #8, r10, asr #7
917 +
918 + ldr lr, c0x80808080
919 +
920 + pkhbt r6, r8, r6, lsl #16
921 + pkhbt r10, r7, r10, lsl #16
922 + uxtb16 r6, r6
923 + uxtb16 r10, r10
924 +
925 + sub r0, r0, r1
926 +
927 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
928 +
929 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
930 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
931 + eor r8, r8, lr @ *oq0 = s^0x80
932 + str r8, [r0] @ store *oq0
933 + sub r0, r0, r1
934 + eor r10, r10, lr @ *op0 = s^0x80
935 + str r10,[r0] @ store *op0
936 +
937 + @roughly 2/7th difference across boundary
938 + mov lr, #0x12 @ 18
939 + mov r7, #0x3f @ 63
940 +
941 + sxtb16 r6, r12
942 + sxtb16 r10, r12, ror #8
943 + smlabb r8, r6, lr, r7
944 + smlatb r6, r6, lr, r7
945 + smlabb r9, r10, lr, r7
946 + smlatb r10, r10, lr, r7
947 + ssat r8, #8, r8, asr #7
948 + ssat r6, #8, r6, asr #7
949 + ssat r9, #8, r9, asr #7
950 + ssat r10, #8, r10, asr #7
951 +
952 + ldr lr, c0x80808080
953 +
954 + pkhbt r6, r8, r6, lsl #16
955 + pkhbt r10, r9, r10, lsl #16
956 +
957 + ldr r9, [sp, #8] @ load qs1
958 + ldr r11, [sp, #12] @ load ps1
959 +
960 + uxtb16 r6, r6
961 + uxtb16 r10, r10
962 +
963 + sub r0, r0, r1
964 +
965 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
966 +
967 + qadd8 r11, r11, r10 @ s = vp8_signed_char_clamp(ps1 + u)
968 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1 - u)
969 + eor r11, r11, lr @ *op1 = s^0x80
970 + str_post r11, r0, r1 @ store *op1
971 + eor r8, r8, lr @ *oq1 = s^0x80
972 + add r0, r0, r1, lsl #1
973 +
974 + mov r7, #0x3f @ 63
975 +
976 + str_post r8, r0, r1 @ store *oq1
977 +
978 + @roughly 1/7th difference across boundary
979 + mov lr, #0x9 @ 9
980 + ldr r9, [r0] @ load q2
981 +
982 + sxtb16 r6, r12
983 + sxtb16 r10, r12, ror #8
984 + smlabb r8, r6, lr, r7
985 + smlatb r6, r6, lr, r7
986 + smlabb r12, r10, lr, r7
987 + smlatb r10, r10, lr, r7
988 + ssat r8, #8, r8, asr #7
989 + ssat r6, #8, r6, asr #7
990 + ssat r12, #8, r12, asr #7
991 + ssat r10, #8, r10, asr #7
992 +
993 + sub r0, r0, r1, lsl #2
994 +
995 + pkhbt r6, r8, r6, lsl #16
996 + pkhbt r10, r12, r10, lsl #16
997 +
998 + sub r0, r0, r1
999 + ldr lr, c0x80808080
1000 +
1001 + ldr r11, [r0] @ load p2
1002 +
1003 + uxtb16 r6, r6
1004 + uxtb16 r10, r10
1005 +
1006 + eor r9, r9, lr
1007 + eor r11, r11, lr
1008 +
1009 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
1010 +
1011 + qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2 + u)
1012 + qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2 - u)
1013 + eor r8, r8, lr @ *op2 = s^0x80
1014 + str_post r8, r0, r1, lsl #2 @ store *op2
1015 + add r0, r0, r1
1016 + eor r10, r10, lr @ *oq2 = s^0x80
1017 + str_post r10, r0, r1, lsl #1 @ store *oq2
1018 +
1019 +2:
1020 + add r0, r0, #4
1021 + sub r0, r0, r1, lsl #3
1022 + subs r5, r5, #1
1023 +
1024 +T ittt ne
1025 + ldrne r10,[r0, r1] @ p2
1026 +A ldrne r9, [r0], r1, lsl #1 @ p3
1027 +T ldrne r9, [r0] @ p3
1028 +T addne r0, r0, r1, lsl #1
1029 +T ittt ne
1030 + ldrne r12,[r0, r1] @ p0
1031 +A ldrne r11,[r0], r1, lsl #1 @ p1
1032 +T ldrne r11,[r0] @ p1
1033 +T addne r0, r0, r1, lsl #1
1034 +
1035 + bne 1b
1036 +
1037 + add sp, sp, #16
1038 + pop {r4 - r11, pc}
1039 +endfunc
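
The three smlabb/smlatb rounds implement the macroblock-edge taper: with w the clamped filter value (hev pixels removed by the bic), the three pixel pairs nearest the edge move by roughly 3/7, 2/7 and 1/7 of w, computed as (k*w + 63) >> 7 for k = 27, 18, 9 and saturated by the ssat #8. A scalar sketch (clip_s8 as in the earlier filter sketch; pixels shown with the ^0x80 bias removed):

    #include <stdint.h>

    static int clip_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    /* Taper stage only; p0/q0 have already had Filter1/Filter2 applied. */
    static void mb_taper(uint8_t *dst, int stride, int w)
    {
        int a0 = clip_s8((27 * w + 63) >> 7);
        int a1 = clip_s8((18 * w + 63) >> 7);
        int a2 = clip_s8(( 9 * w + 63) >> 7);

        dst[-1*stride] = clip_s8((dst[-1*stride] - 128) + a0) + 128; /* p0 */
        dst[ 0*stride] = clip_s8((dst[ 0*stride] - 128) - a0) + 128; /* q0 */
        dst[-2*stride] = clip_s8((dst[-2*stride] - 128) + a1) + 128; /* p1 */
        dst[ 1*stride] = clip_s8((dst[ 1*stride] - 128) - a1) + 128; /* q1 */
        dst[-3*stride] = clip_s8((dst[-3*stride] - 128) + a2) + 128; /* p2 */
        dst[ 2*stride] = clip_s8((dst[ 2*stride] - 128) - a2) + 128; /* q2 */
    }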
1040 +
1041 +.macro TRANSPOSE_MATRIX i0, i1, i2, i3, o3, o2, o1, o0
1042 + @ input: $0, $1, $2, $3
1043 + @ output: $4, $5, $6, $7
1044 + @ i0: 03 02 01 00
1045 + @ i1: 13 12 11 10
1046 + @ i2: 23 22 21 20
1047 + @ i3: 33 32 31 30
1048 + @ o3 o2 o1 o0
1049 +
1050 + uxtb16 \o1, \i1 @ xx 12 xx 10
1051 + uxtb16 \o0, \i0 @ xx 02 xx 00
1052 + uxtb16 \o3, \i3 @ xx 32 xx 30
1053 + uxtb16 \o2, \i2 @ xx 22 xx 20
1054 + orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00
1055 + orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20
1056 +
1057 + uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11
1058 + uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31
1059 + uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01
1060 + uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21
1061 + orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01
1062 + orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21
1063 +
1064 + pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 -- p1
1065 + pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 -- p3
1066 +
1067 + pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 -- p0
1068 + pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 -- p2
1069 +.endm
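
In scalar terms the macro is a 4x4 byte transpose between four packed little-endian words; it is what lets the horizontal filters below reuse the 4-in-parallel vertical code. Sketch:

    #include <stdint.h>

    /* byte j of out[i] = byte i of in[j], i.e. the uxtb16/pkhbt shuffle */
    static void transpose4x4_u8(uint32_t out[4], const uint32_t in[4])
    {
        for (int i = 0; i < 4; i++) {
            uint32_t v = 0;
            for (int j = 0; j < 4; j++)
                v |= (uint32_t)((in[j] >> (8 * i)) & 0xff) << (8 * j);
            out[i] = v;
        }
    }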
1070 +
1071 +@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
1072 +function ff_vp8_h_loop_filter16_simple_armv6, export=1
1073 + push {r4 - r11, lr}
1074 + orr r12, r2, r2, lsl #16
1075 + ldr r2, c0x80808080
1076 + orr r12, r12, r12, lsl #8
1077 +
1078 + @ load source data to r7, r8, r9, r10
1079 + sub r0, r0, #2
1080 + ldr r8, [r0, r1]
1081 + ldr_post r7, r0, r1, lsl #1
1082 + ldr r10,[r0, r1]
1083 + ldr_post r9, r0, r1, lsl #1
1084 + add r0, r0, #2
1085 +
1086 + mov r11, #4 @ count (r11) for 4-in-parallel
1087 +1:
1088 + @transpose r7, r8, r9, r10 to r3, r4, r5, r6
1089 + TRANSPOSE_MATRIX r7, r8, r9, r10, r6, r5, r4, r3
1090 +
1091 + @ vp8_simple_filter_mask() function
1092 + uqsub8 r7, r3, r6 @ p1 - q1
1093 + uqsub8 r8, r6, r3 @ q1 - p1
1094 + uqsub8 r9, r4, r5 @ p0 - q0
1095 + uqsub8 r10, r5, r4 @ q0 - p0
1096 + orr r7, r7, r8 @ abs(p1 - q1)
1097 + orr r9, r9, r10 @ abs(p0 - q0)
1098 + mov r8, #0
1099 + uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2
1100 + uhadd8 r7, r7, r8 @ abs(p1 - q1) / 2
1101 + uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1 - q1)/2
1102 + mvn r10, #0 @ r10 == -1
1103 +
1104 + usub8 r7, r12, r7 @ compare to flimit
1105 + sel lr, r10, r8 @ filter mask
1106 +
1107 + cmp lr, #0
1108 + beq 2f @ skip filtering
1109 +
1110 + @vp8_simple_filter() function
1111 + eor r3, r3, r2 @ p1 offset to convert to a signed value
1112 + eor r6, r6, r2 @ q1 offset to convert to a signed value
1113 + eor r4, r4, r2 @ p0 offset to convert to a signed value
1114 + eor r5, r5, r2 @ q0 offset to convert to a signed value
1115 +
1116 + qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
1117 + qsub8 r6, r5, r4 @ q0 - p0
1118 +
1119 + qadd8 r3, r3, r6 @ vp8_filter += q0 - p0
1120 + ldr r9, c0x03030303 @ r9 = 3
1121 +
1122 + qadd8 r3, r3, r6 @ vp8_filter += q0 - p0
1123 + ldr r7, c0x04040404
1124 +
1125 + qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)
1126 + @STALL
1127 + and r3, r3, lr @ vp8_filter &= mask
1128 +
1129 + qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3
1130 + qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4
1131 +
1132 + shadd8 r9, r9, r8
1133 + shadd8 r3, r3, r8
1134 + shadd8 r9, r9, r8
1135 + shadd8 r3, r3, r8
1136 + shadd8 r9, r9, r8 @ Filter2 >>= 3
1137 + shadd8 r3, r3, r8 @ Filter1 >>= 3
1138 +
1139 + @calculate output
1140 + sub r0, r0, r1, lsl #2
1141 +
1142 + qadd8 r4, r4, r9 @ u = p0 + Filter2
1143 + qsub8 r5, r5, r3 @ u = q0 - Filter1
1144 + eor r4, r4, r2 @ *op0 = u^0x80
1145 + eor r5, r5, r2 @ *oq0 = u^0x80
1146 +
1147 + strb r4, [r0, #-1] @ store the result
1148 + mov r4, r4, lsr #8
1149 + strb_post r5, r0, r1
1150 + mov r5, r5, lsr #8
1151 +
1152 + strb r4, [r0, #-1]
1153 + mov r4, r4, lsr #8
1154 + strb_post r5, r0, r1
1155 + mov r5, r5, lsr #8
1156 +
1157 + strb r4, [r0, #-1]
1158 + mov r4, r4, lsr #8
1159 + strb_post r5, r0, r1
1160 + mov r5, r5, lsr #8
1161 +
1162 + strb r4, [r0, #-1]
1163 + strb_post r5, r0, r1
1164 +
1165 +2:
1166 + subs r11, r11, #1
1167 +
1168 + @ load source data to r7, r8, r9, r10
1169 + sub r0, r0, #2
1170 +T ittt ne
1171 + ldrne r8, [r0, r1]
1172 +A ldrne r7, [r0], r1, lsl #1
1173 +T ldrne r7, [r0]
1174 +T addne r0, r0, r1, lsl #1
1175 +T ittt ne
1176 + ldrne r10,[r0, r1]
1177 +A ldrne r9, [r0], r1, lsl #1
1178 +T ldrne r9, [r0]
1179 +T addne r0, r0, r1, lsl #1
1180 + add r0, r0, #2
1181 +
1182 + bne 1b
1183 +
1184 + pop {r4 - r11, pc}
1185 +endfunc
1186 +
1187 +@ void vp8_h_loop_filter16_inner(uint8_t *dst, int stride,
1188 +@ int fE, int fI, int hev_thresh)
1189 +@ and
1190 +@ void vp8_h_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
1191 +@ int fE, int fI, int hev_thresh)
1192 +@ call:
1193 +@ void vp8_h_loop_filter_inner(uint8_t *dst, int stride,
1194 +@ int fE, int fI, int hev_thresh, int count)
1195 +function ff_vp8_h_loop_filter_inner_armv6, export=1
1196 + push {r4 - r11, lr}
1197 +
1198 + sub r0, r0, #4 @ move r0 pointer down by 4
1199 + ldr r5, [sp, #40] @ counter
1200 + ldr r9, [sp, #36] @ load thresh address
1201 + sub sp, sp, #16 @ create temp buffer
1202 +
1203 + ldr r7, [r0, r1] @ transpose will make it into p3-p0
1204 + ldr_post r6, r0, r1, lsl #1 @ load source data
1205 + ldr lr, [r0, r1]
1206 + ldr_post r8, r0, r1, lsl #1
1207 +
1208 + orr r2, r2, r2, lsl #16
1209 + orr r3, r3, r3, lsl #16
1210 + orr r9, r9, r9, lsl #16
1211 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
1212 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
1213 + orr r3, r9, r9, lsl #8 @ thresh splat int -> byte
1214 +
1215 +1:
1216 + @ vp8_filter_mask() function
1217 + @ calculate breakout conditions
1218 + @ transpose the source data for 4-in-parallel operation
1219 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9
1220 +
1221 + uqsub8 r7, r9, r10 @ p3 - p2
1222 + uqsub8 r8, r10, r9 @ p2 - p3
1223 + uqsub8 r9, r10, r11 @ p2 - p1
1224 + uqsub8 r10, r11, r10 @ p1 - p2
1225 + orr r7, r7, r8 @ abs (p3-p2)
1226 + orr r10, r9, r10 @ abs (p2-p1)
1227 + uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_filter_mask
1228 + uqsub8 r10, r10, r2 @ compare to limit
1229 +
1230 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1231 +
1232 + orr lr, lr, r10
1233 +
1234 + uqsub8 r6, r11, r12 @ p1 - p0
1235 + uqsub8 r7, r12, r11 @ p0 - p1
1236 + add r0, r0, #4 @ move r0 pointer up by 4
1237 + orr r6, r6, r7 @ abs (p1-p0)
1238 + str r11,[sp, #12] @ save p1
1239 + uqsub8 r10, r6, r2 @ compare to limit
1240 + uqsub8 r11, r6, r3 @ compare to thresh
1241 + orr lr, lr, r10
1242 +
1243 + @ transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
1244 + @ transpose the source data for 4-in-parallel operation
1245 + str r11,[sp] @ push r11 to stack
1246 + ldr r7, [r0, r1]
1247 + ldr_post r6, r0, r1, lsl #1 @ load source data
1248 + str r12,[sp, #4] @ save current reg before load q0 - q3 data
1249 + str lr, [sp, #8]
1250 + ldr lr, [r0, r1]
1251 + ldr_post r8, r0, r1, lsl #1
1252 +
1253 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9
1254 +
1255 + ldr lr, [sp, #8] @ load back (f)limit accumulator
1256 +
1257 + uqsub8 r6, r12, r11 @ q3 - q2
1258 + uqsub8 r7, r11, r12 @ q2 - q3
1259 + uqsub8 r12, r11, r10 @ q2 - q1
1260 + uqsub8 r11, r10, r11 @ q1 - q2
1261 + orr r6, r6, r7 @ abs (q3-q2)
1262 + orr r7, r12, r11 @ abs (q2-q1)
1263 + uqsub8 r6, r6, r2 @ compare to limit
1264 + uqsub8 r7, r7, r2 @ compare to limit
1265 + ldr r11,[sp, #4] @ load back p0
1266 + ldr r12,[sp, #12] @ load back p1
1267 + orr lr, lr, r6
1268 + orr lr, lr, r7
1269 +
1270 + uqsub8 r6, r11, r9 @ p0 - q0
1271 + uqsub8 r7, r9, r11 @ q0 - p0
1272 + uqsub8 r8, r12, r10 @ p1 - q1
1273 + uqsub8 r11, r10, r12 @ q1 - p1
1274 + orr r6, r6, r7 @ abs (p0-q0)
1275 + ldr r7, c0x7F7F7F7F
1276 + orr r8, r8, r11 @ abs (p1-q1)
1277 + uqadd8 r6, r6, r6 @ abs (p0-q0) * 2
1278 + and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2
1279 + uqsub8 r11, r10, r9 @ q1 - q0
1280 + uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
1281 + uqsub8 r12, r9, r10 @ q0 - q1
1282 + uqsub8 r6, r6, r4 @ compare to flimit
1283 +
1284 + orr r9, r11, r12 @ abs (q1-q0)
1285 + uqsub8 r8, r9, r2 @ compare to limit
1286 + uqsub8 r10, r9, r3 @ compare to thresh
1287 + orr lr, lr, r6
1288 + orr lr, lr, r8
1289 +
1290 + mvn r11, #0 @ r11 == -1
1291 + mov r12, #0
1292 +
1293 + usub8 lr, r12, lr
1294 + ldr r9, [sp] @ load the compared result
1295 + sel lr, r11, r12 @ filter mask: lr
1296 +
1297 + cmp lr, #0
1298 + beq 2f @ skip filtering
1299 +
1300 + @vp8_hevmask() function
1301 + @calculate high edge variance
1302 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1303 +
1304 + orr r9, r9, r10
1305 +
1306 + ldrh r7, [r0, #-2]
1307 + ldrh_post r8, r0, r1
1308 +
1309 + usub8 r9, r12, r9
1310 + sel r6, r12, r11 @ hev mask: r6
1311 +
1312 + @vp8_filter() function
1313 + @ load source data to r6, r11, r12, lr
1314 + ldrh r9, [r0, #-2]
1315 + ldrh_post r10, r0, r1
1316 +
1317 + pkhbt r12, r7, r8, lsl #16
1318 +
1319 + ldrh r7, [r0, #-2]
1320 + ldrh_post r8, r0, r1
1321 +
1322 + pkhbt r11, r9, r10, lsl #16
1323 +
1324 + ldrh r9, [r0, #-2]
1325 + ldrh_post r10, r0, r1
1326 +
1327 + @ Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
1328 + str r6, [sp]
1329 + str lr, [sp, #4]
1330 +
1331 + pkhbt r6, r7, r8, lsl #16
1332 + pkhbt lr, r9, r10, lsl #16
1333 +
1334 + @transpose r12, r11, r6, lr to r7, r8, r9, r10
1335 + TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7
1336 +
1337 + @load back hev_mask r6 and filter_mask lr
1338 + ldr r12, c0x80808080
1339 + ldr r6, [sp]
1340 + ldr lr, [sp, #4]
1341 +
1342 + eor r7, r7, r12 @ p1 offset to convert to a signed value
1343 + eor r8, r8, r12 @ p0 offset to convert to a signed value
1344 + eor r9, r9, r12 @ q0 offset to convert to a signed value
1345 + eor r10, r10, r12 @ q1 offset to convert to a signed value
1346 +
1347 + str r9, [sp] @ store qs0 temporarily
1348 + str r8, [sp, #4] @ store ps0 temporarily
1349 + str r10,[sp, #8] @ store qs1 temporarily
1350 + str r7, [sp, #12] @ store ps1 temporarily
1351 +
1352 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
1353 + qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
1354 +
1355 + and r7, r7, r6 @ vp8_filter (r7) &= hev (r7 : filter)
1356 +
1357 + qadd8 r7, r7, r8
1358 + ldr r9, c0x03030303 @ r9 = 3 --modified for vp8
1359 +
1360 + qadd8 r7, r7, r8
1361 + ldr r10, c0x04040404
1362 +
1363 + qadd8 r7, r7, r8
1364 +
1365 + and r7, r7, lr @ vp8_filter &= mask
1366 +
1367 + qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
1368 + qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
1369 +
1370 + mov r9, #0
1371 + shadd8 r8, r8, r9 @ Filter2 >>= 3
1372 + shadd8 r7, r7, r9 @ vp8_filter >>= 3
1373 + shadd8 r8, r8, r9
1374 + shadd8 r7, r7, r9
1375 + shadd8 lr, r8, r9 @ lr: filter2
1376 + shadd8 r7, r7, r9 @ r7: filter
1377 +
1378 + @calculate output
1379 + ldr r8, [sp] @ load qs0
1380 + ldr r9, [sp, #4] @ load ps0
1381 +
1382 + ldr r10, c0x01010101
1383 +
1384 + qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0 - vp8_filter)
1385 + qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)
1386 +
1387 + eor r8, r8, r12
1388 + eor r9, r9, r12
1389 +
1390 + mov lr, #0
1391 +
1392 + sadd8 r7, r7, r10
1393 + shadd8 r7, r7, lr
1394 +
1395 + ldr r10,[sp, #8] @ load qs1
1396 + ldr r11,[sp, #12] @ load ps1
1397 +
1398 + bic r7, r7, r6 @ r7: vp8_filter
1399 +
1400 + qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
1401 + qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
1402 + eor r10, r10, r12
1403 + eor r11, r11, r12
1404 +
1405 + sub r0, r0, r1, lsl #2
1406 +
1407 + @we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
1408 + TRANSPOSE_MATRIX r11, r9, r8, r10, lr, r12, r7, r6
1409 +
1410 + strh r6, [r0, #-2] @ store the result
1411 + mov r6, r6, lsr #16
1412 + strh_post r6, r0, r1
1413 +
1414 + strh r7, [r0, #-2]
1415 + mov r7, r7, lsr #16
1416 + strh_post r7, r0, r1
1417 +
1418 + strh r12, [r0, #-2]
1419 + mov r12, r12, lsr #16
1420 + strh_post r12, r0, r1
1421 +
1422 + strh lr, [r0, #-2]
1423 + mov lr, lr, lsr #16
1424 + strh_post lr, r0, r1
1425 +
1426 +2:
1427 + sub r0, r0, #4
1428 + subs r5, r5, #1
1429 +
1430 +T ittt ne
1431 + ldrne r7, [r0, r1]
1432 +A ldrne r6, [r0], r1, lsl #1 @ load source data
1433 +T ldrne r6, [r0] @ load source data
1434 +T addne r0, r0, r1, lsl #1
1435 +T ittt ne
1436 + ldrne lr, [r0, r1]
1437 +A ldrne r8, [r0], r1, lsl #1
1438 +T ldrne r8, [r0]
1439 +T addne r0, r0, r1, lsl #1
1440 +
1441 + bne 1b
1442 +
1443 + add sp, sp, #16
1444 + pop {r4 - r11, pc}
1445 +endfunc
1446 +
1447 +@ void vp8_h_loop_filter16(uint8_t *dst, int stride,
1448 +@ int fE, int fI, int hev_thresh)
1449 +@ and
1450 +@ void vp8_h_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
1451 +@ int fE, int fI, int hev_thresh)
1452 +@ call:
1453 +@ void vp8_h_loop_filter(uint8_t *dst, int stride,
1454 +@ int fE, int fI, int hev_thresh, int count)
1455 +function ff_vp8_h_loop_filter_armv6, export=1
1456 + push {r4 - r11, lr}
1457 +
1458 + sub r0, r0, #4 @ move r0 pointer down by 4
1459 + ldr r5, [sp, #40] @ counter
1460 + ldr r9, [sp, #36] @ load thresh address
1461 + sub sp, sp, #16 @ create temp buffer
1462 +
1463 + ldr r7, [r0, r1] @ transpose will make it into p3-p0
1464 + ldr_post r6, r0, r1, lsl #1 @ load source data
1465 + ldr lr, [r0, r1]
1466 + ldr_post r8, r0, r1, lsl #1
1467 +
1468 + orr r2, r2, r2, lsl #16
1469 + orr r3, r3, r3, lsl #16
1470 + orr r9, r9, r9, lsl #16
1471 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte
1472 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte
1473 + orr r3, r9, r9, lsl #8 @ thresh splat int -> byte
1474 +
1475 +1:
1476 + @ vp8_filter_mask() function
1477 + @ calculate breakout conditions
1478 + @ transpose the source data for 4-in-parallel operation
1479 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9
1480 +
1481 + uqsub8 r7, r9, r10 @ p3 - p2
1482 + uqsub8 r8, r10, r9 @ p2 - p3
1483 + uqsub8 r9, r10, r11 @ p2 - p1
1484 + uqsub8 r10, r11, r10 @ p1 - p2
1485 + orr r7, r7, r8 @ abs (p3-p2)
1486 + orr r10, r9, r10 @ abs (p2-p1)
1487 + uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_filter_mask
1488 + uqsub8 r10, r10, r2 @ compare to limit
1489 +
1490 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1491 +
1492 + orr lr, lr, r10
1493 +
1494 + uqsub8 r6, r11, r12 @ p1 - p0
1495 + uqsub8 r7, r12, r11 @ p0 - p1
1496 + add r0, r0, #4 @ move r0 pointer up by 4
1497 + orr r6, r6, r7 @ abs (p1-p0)
1498 + str r11,[sp, #12] @ save p1
1499 + uqsub8 r10, r6, r2 @ compare to limit
1500 + uqsub8 r11, r6, r3 @ compare to thresh
1501 + orr lr, lr, r10
1502 +
1503 + @ transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
1504 + @ transpose the source data for 4-in-parallel operation
1505 + str r11,[sp] @ push r11 to stack
1506 + ldr r7, [r0, r1]
1507 + ldr_post r6, r0, r1, lsl #1 @ load source data
1508 + str r12,[sp, #4] @ save current reg before load q0 - q3 data
1509 + str lr, [sp, #8]
1510 + ldr lr, [r0, r1]
1511 + ldr_post r8, r0, r1, lsl #1
1512 +
1513 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9
1514 +
1515 + ldr lr, [sp, #8] @ load back (f)limit accumulator
1516 +
1517 + uqsub8 r6, r12, r11 @ q3 - q2
1518 + uqsub8 r7, r11, r12 @ q2 - q3
1519 + uqsub8 r12, r11, r10 @ q2 - q1
1520 + uqsub8 r11, r10, r11 @ q1 - q2
1521 + orr r6, r6, r7 @ abs (q3-q2)
1522 + orr r7, r12, r11 @ abs (q2-q1)
1523 + uqsub8 r6, r6, r2 @ compare to limit
1524 + uqsub8 r7, r7, r2 @ compare to limit
1525 + ldr r11,[sp, #4] @ load back p0
1526 + ldr r12,[sp, #12] @ load back p1
1527 + orr lr, lr, r6
1528 + orr lr, lr, r7
1529 +
1530 + uqsub8 r6, r11, r9 @ p0 - q0
1531 + uqsub8 r7, r9, r11 @ q0 - p0
1532 + uqsub8 r8, r12, r10 @ p1 - q1
1533 + uqsub8 r11, r10, r12 @ q1 - p1
1534 + orr r6, r6, r7 @ abs (p0-q0)
1535 + ldr r7, c0x7F7F7F7F
1536 + orr r8, r8, r11 @ abs (p1-q1)
1537 + uqadd8 r6, r6, r6 @ abs (p0-q0) * 2
1538 + and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2
1539 + uqsub8 r11, r10, r9 @ q1 - q0
1540 + uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2
1541 + uqsub8 r12, r9, r10 @ q0 - q1
1542 + uqsub8 r6, r6, r4 @ compare to flimit
1543 +
1544 + orr r9, r11, r12 @ abs (q1-q0)
1545 + uqsub8 r8, r9, r2 @ compare to limit
1546 + uqsub8 r10, r9, r3 @ compare to thresh
1547 + orr lr, lr, r6
1548 + orr lr, lr, r8
1549 +
1550 + mvn r11, #0 @ r11 == -1
1551 + mov r12, #0
1552 +
1553 + usub8 lr, r12, lr
1554 + ldr r9, [sp] @ load the compared result
1555 + sel lr, r11, r12 @ filter mask: lr
1556 +
1557 + cmp lr, #0
1558 + beq 2f @ skip filtering
1559 +
1560 +
1561 + @vp8_hevmask() function
1562 + @calculate high edge variance
1563 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1564 +
1565 + orr r9, r9, r10
1566 +
1567 + ldrh r7, [r0, #-2]
1568 + ldrh_post r8, r0, r1
1569 +
1570 + usub8 r9, r12, r9
1571 + sel r6, r12, r11 @ hev mask: r6
1572 +
1573 +
1574 + @ vp8_mbfilter() function
1575 + @ p2, q2 are only needed at the end. do not need to load them in now.
1576 + @ Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
1577 + @ load source data to r6, r11, r12, lr
1578 + ldrh r9, [r0, #-2]
1579 + ldrh_post r10, r0, r1
1580 +
1581 + pkhbt r12, r7, r8, lsl #16
1582 +
1583 + ldrh r7, [r0, #-2]
1584 + ldrh_post r8, r0, r1
1585 +
1586 + pkhbt r11, r9, r10, lsl #16
1587 +
1588 + ldrh r9, [r0, #-2]
1589 + ldrh_post r10, r0, r1
1590 +
1591 + str r6, [sp] @ save r6
1592 + str lr, [sp, #4] @ save lr
1593 +
1594 + pkhbt r6, r7, r8, lsl #16
1595 + pkhbt lr, r9, r10, lsl #16
1596 +
1597 + @transpose r12, r11, r6, lr to p1, p0, q0, q1
1598 + TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7
1599 +
1600 + @load back hev_mask r6 and filter_mask lr
1601 + ldr r12, c0x80808080
1602 + ldr r6, [sp]
1603 + ldr lr, [sp, #4]
1604 +
1605 + eor r7, r7, r12 @ ps1
1606 + eor r8, r8, r12 @ ps0
1607 + eor r9, r9, r12 @ qs0
1608 + eor r10, r10, r12 @ qs1
1609 +
1610 + qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
1611 + str r7, [sp, #12] @ store ps1 temporarily
1612 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1)
1613 + str r10,[sp, #8] @ store qs1 temporarily
1614 + qadd8 r7, r7, r12
1615 + str r9, [sp] @ store qs0 temporarily
1616 + qadd8 r7, r7, r12
1617 + str r8, [sp, #4] @ store ps0 temporarily
1618 + qadd8 r7, r7, r12 @ vp8_filter: r7
1619 +
1620 + ldr r10, c0x03030303 @ r10 = 3 --modified for vp8
1621 + ldr r9, c0x04040404
1622 +
1623 + and r7, r7, lr @ vp8_filter &= mask (lr is free)
1624 +
1625 + mov r12, r7 @ Filter2: r12
1626 + and r12, r12, r6 @ Filter2 &= hev
1627 +
1628 + @save bottom 3 bits so that we round one side +4 and the other +3
1629 + qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
1630 + qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
1631 +
1632 + mov r10, #0
1633 + shadd8 r8, r8, r10 @ Filter1 >>= 3
1634 + shadd8 r12, r12, r10 @ Filter2 >>= 3
1635 + shadd8 r8, r8, r10
1636 + shadd8 r12, r12, r10
1637 + shadd8 r8, r8, r10 @ r8: Filter1
1638 + shadd8 r12, r12, r10 @ r12: Filter2
1639 +
1640 + ldr r9, [sp] @ load qs0
1641 + ldr r11,[sp, #4] @ load ps0
1642 +
1643 + qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(qs0 - Filter1)
1644 + qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(ps0 + Filter2)
1645 +
1646 + bic r12, r7, r6 @ vp8_filter &= ~hev (r6 is free)
1647 +
1648 + @roughly 3/7th difference across boundary
1649 + mov lr, #0x1b @ 27
1650 + mov r7, #0x3f @ 63
1651 +
1652 + sxtb16 r6, r12
1653 + sxtb16 r10, r12, ror #8
1654 + smlabb r8, r6, lr, r7
1655 + smlatb r6, r6, lr, r7
1656 + smlabb r7, r10, lr, r7
1657 + smultb r10, r10, lr
1658 + ssat r8, #8, r8, asr #7
1659 + ssat r6, #8, r6, asr #7
1660 + add r10, r10, #63
1661 + ssat r7, #8, r7, asr #7
1662 + ssat r10, #8, r10, asr #7
1663 +
1664 + ldr lr, c0x80808080
1665 +
1666 + pkhbt r6, r8, r6, lsl #16
1667 + pkhbt r10, r7, r10, lsl #16
1668 + uxtb16 r6, r6
1669 + uxtb16 r10, r10
1670 +
1671 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1672 +
1673 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
1674 +
1675 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
1676 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
1677 + eor r8, r8, lr @ *oq0 = s^0x80
1678 + eor r10, r10, lr @ *op0 = s^0x80
1679 +
1680 + strb r10,[r0, #-1] @ store op0 result
1681 + strb_post r8, r0, r1 @ store oq0 result
1682 + mov r10, r10, lsr #8
1683 + mov r8, r8, lsr #8
1684 + strb r10,[r0, #-1]
1685 + strb_post r8, r0, r1
1686 + mov r10, r10, lsr #8
1687 + mov r8, r8, lsr #8
1688 + strb r10,[r0, #-1]
1689 + strb_post r8, r0, r1
1690 + mov r10, r10, lsr #8
1691 + mov r8, r8, lsr #8
1692 + strb r10,[r0, #-1]
1693 + strb_post r8, r0, r1
1694 +
1695 + @roughly 2/7th difference across boundary
1696 + mov lr, #0x12 @ 18
1697 + mov r7, #0x3f @ 63
1698 +
1699 + sxtb16 r6, r12
1700 + sxtb16 r10, r12, ror #8
1701 + smlabb r8, r6, lr, r7
1702 + smlatb r6, r6, lr, r7
1703 + smlabb r9, r10, lr, r7
1704 + smlatb r10, r10, lr, r7
1705 + ssat r8, #8, r8, asr #7
1706 + ssat r6, #8, r6, asr #7
1707 + ssat r9, #8, r9, asr #7
1708 + ssat r10, #8, r10, asr #7
1709 +
1710 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lines
1711 +
1712 + pkhbt r6, r8, r6, lsl #16
1713 + pkhbt r10, r9, r10, lsl #16
1714 +
1715 + ldr r9, [sp, #8] @ load qs1
1716 + ldr r11,[sp, #12] @ load ps1
1717 + ldr lr, c0x80808080
1718 +
1719 + uxtb16 r6, r6
1720 + uxtb16 r10, r10
1721 +
1722 + add r0, r0, #2
1723 +
1724 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
1725 +
1726 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1 - u)
1727 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps1 + u)
1728 + eor r8, r8, lr @ *oq1 = s^0x80
1729 + eor r10, r10, lr @ *op1 = s^0x80
1730 +
1731 + ldrb r11,[r0, #-5] @ load p2 for 1/7th difference across boundary
1732 + strb r10,[r0, #-4] @ store op1
1733 + strb r8, [r0, #-1] @ store oq1
1734 + ldrb_post r9, r0, r1 @ load q2 for 1/7th difference across boundary
1735 +
1736 + mov r10, r10, lsr #8
1737 + mov r8, r8, lsr #8
1738 +
1739 + ldrb r6, [r0, #-5]
1740 + strb r10,[r0, #-4]
1741 + strb r8, [r0, #-1]
1742 + ldrb_post r7, r0, r1
1743 +
1744 + mov r10, r10, lsr #8
1745 + mov r8, r8, lsr #8
1746 + orr r11, r11, r6, lsl #8
1747 + orr r9, r9, r7, lsl #8
1748 +
1749 + ldrb r6, [r0, #-5]
1750 + strb r10,[r0, #-4]
1751 + strb r8, [r0, #-1]
1752 + ldrb_post r7, r0, r1
1753 +
1754 + mov r10, r10, lsr #8
1755 + mov r8, r8, lsr #8
1756 + orr r11, r11, r6, lsl #16
1757 + orr r9, r9, r7, lsl #16
1758 +
1759 + ldrb r6, [r0, #-5]
1760 + strb r10,[r0, #-4]
1761 + strb r8, [r0, #-1]
1762 + ldrb_post r7, r0, r1
1763 + orr r11, r11, r6, lsl #24
1764 + orr r9, r9, r7, lsl #24
1765 +
1766 + @roughly 1/7th difference across boundary
1767 + eor r9, r9, lr
1768 + eor r11, r11, lr
1769 +
1770 + mov lr, #0x9 @ 9
1771 + mov r7, #0x3f @ 63
1772 +
1773 + sxtb16 r6, r12
1774 + sxtb16 r10, r12, ror #8
1775 + smlabb r8, r6, lr, r7
1776 + smlatb r6, r6, lr, r7
1777 + smlabb r12, r10, lr, r7
1778 + smlatb r10, r10, lr, r7
1779 + ssat r8, #8, r8, asr #7
1780 + ssat r6, #8, r6, asr #7
1781 + ssat r12, #8, r12, asr #7
1782 + ssat r10, #8, r10, asr #7
1783 +
1784 + sub r0, r0, r1, lsl #2
1785 +
1786 + pkhbt r6, r8, r6, lsl #16
1787 + pkhbt r10, r12, r10, lsl #16
1788 +
1789 + uxtb16 r6, r6
1790 + uxtb16 r10, r10
1791 +
1792 + ldr lr, c0x80808080
1793 +
1794 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
1795 +
1796 + qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2 + u)
1797 + qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2 - u)
1798 + eor r8, r8, lr @ *op2 = s^0x80
1799 + eor r10, r10, lr @ *oq2 = s^0x80
1800 +
1801 + strb r8, [r0, #-5] @ store *op2
1802 + strb_post r10, r0, r1 @ store *oq2
1803 + mov r8, r8, lsr #8
1804 + mov r10, r10, lsr #8
1805 + strb r8, [r0, #-5]
1806 + strb_post r10, r0, r1
1807 + mov r8, r8, lsr #8
1808 + mov r10, r10, lsr #8
1809 + strb r8, [r0, #-5]
1810 + strb_post r10, r0, r1
1811 + mov r8, r8, lsr #8
1812 + mov r10, r10, lsr #8
1813 + strb r8, [r0, #-5]
1814 + strb_post r10, r0, r1
1815 +
1816 + @adjust r0 pointer for next loop
1817 + sub r0, r0, #2
1818 +
1819 +2:
1820 + sub r0, r0, #4
1821 + subs r5, r5, #1
1822 +
1823 +T ittt ne
1824 + ldrne r7, [r0, r1]
1825 +A ldrne r6, [r0], r1, lsl #1 @ load source data
1826 +T ldrne r6, [r0]
1827 +T addne r0, r0, r1, lsl #1
1828 +T ittt ne
1829 + ldrne lr, [r0, r1]
1830 +A ldrne r8, [r0], r1, lsl #1
1831 +T ldrne r8, [r0]
1832 +T addne r0, r0, r1, lsl #1
1833 +
1834 + bne 1b
1835 +
1836 + add sp, sp, #16
1837 + pop {r4 - r11, pc}
1838 +endfunc
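
A scalar model of the three update taps applied above may help when reading the 27/18/9 blocks: each computes vp8_signed_char_clamp((63 + Filter2 * k) >> 7), exactly the mov #0x3f / smlabb / ssat #8, asr #7 sequence. A minimal sketch in C (clamp_s8 and tap are illustrative names, not part of the patch):

    #include <stdint.h>

    static int8_t clamp_s8(int v)            /* vp8_signed_char_clamp */
    {
        return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v;
    }

    static int8_t tap(int w, int k)          /* w = Filter2, k = 27, 18 or 9 */
    {
        return clamp_s8((63 + w * k) >> 7);  /* add #0x3f; ssat #8, asr #7 */
    }

    /* p0 += tap(w, 27);  q0 -= tap(w, 27);
       p1 += tap(w, 18);  q1 -= tap(w, 18);
       p2 += tap(w, 9);   q2 -= tap(w, 9);   all via saturating qadd8/qsub8 */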
1839 +
1840 +@ MC
1841 +
1842 +@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
1843 +@ int srcstride, int h, int mx, int my)
1844 +function ff_put_vp8_pixels16_armv6, export=1
1845 + push {r4 - r11}
1846 + ldr r12,[sp, #32] @ h
1847 +1:
1848 + subs r12, r12, #2
1849 + ldr r5, [r2, #4]
1850 + ldr r6, [r2, #8]
1851 + ldr r7, [r2, #12]
1852 + ldr_post r4, r2, r3
1853 + ldr r9, [r2, #4]
1854 + ldr r10,[r2, #8]
1855 + ldr r11,[r2, #12]
1856 + ldr_post r8, r2, r3
1857 + strd r6, r7, [r0, #8]
1858 + strd_post r4, r5, r0, r1
1859 + strd r10, r11,[r0, #8]
1860 + strd_post r8, r9, r0, r1
1861 + bgt 1b
1862 + pop {r4 - r11}
1863 + bx lr
1864 +endfunc
1865 +
1866 +@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
1867 +@ int srcstride, int h, int mx, int my)
1868 +function ff_put_vp8_pixels8_armv6, export=1
1869 + push {r4 - r11}
1870 + ldr r12,[sp, #32] @ h
1871 +1:
1872 + subs r12, r12, #4
1873 + ldr r5, [r2, #4]
1874 + ldr_post r4, r2, r3
1875 + ldr r7, [r2, #4]
1876 + ldr_post r6, r2, r3
1877 + ldr r9, [r2, #4]
1878 + ldr_post r8, r2, r3
1879 + ldr r11,[r2, #4]
1880 + ldr_post r10, r2, r3
1881 + strd_post r4, r5, r0, r1
1882 + strd_post r6, r7, r0, r1
1883 + strd_post r8, r9, r0, r1
1884 + strd_post r10, r11, r0, r1
1885 + bgt 1b
1886 + pop {r4 - r11}
1887 + bx lr
1888 +endfunc
1889 +
1890 +@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
1891 +@ int srcstride, int h, int mx, int my)
1892 +function ff_put_vp8_pixels4_armv6, export=1
1893 + ldr r12, [sp, #0] @ h
1894 + push {r4 - r6, lr}
1895 +1:
1896 + subs r12, r12, #4
1897 + ldr r5, [r2, r3]
1898 + ldr_post r4, r2, r3, lsl #1
1899 + ldr lr, [r2, r3]
1900 + ldr_post r6, r2, r3, lsl #1
1901 + str r5, [r0, r1]
1902 + str_post r4, r0, r1, lsl #1
1903 + str lr, [r0, r1]
1904 + str_post r6, r0, r1, lsl #1
1905 + bgt 1b
1906 + pop {r4 - r6, pc}
1907 +endfunc
1908 +
1909 +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so 16-bit
1910 +@ arithmetic can be used to apply the filters
1911 +const sixtap_filters_13245600, align=4
1912 + .short 2, 108, -11, 36, -8, 1, 0, 0
1913 + .short 3, 77, -16, 77, -16, 3, 0, 0
1914 + .short 1, 36, -8, 108, -11, 2, 0, 0
1915 +endconst
1916 +const fourtap_filters_1324, align=4
1917 + .short -6, 12, 123, -1
1918 + .short -9, 50, 93, -6
1919 + .short -6, 93, 50, -9
1920 + .short -1, 123, 12, -6
1921 +endconst
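
Per the note above, every intermediate sum fits in 16 bits, and each output pixel is rounded and clamped with the add #0x40 / usat #8, asr #7 tail seen in the functions below. A hedged scalar reference of that per-pixel computation, written with the taps in natural 0..5 order (the "13245600" table instead stores them as 1,3,2,4,5,6 so SMUAD/SMLAD can consume them in pairs); filter6 is an illustrative name:

    #include <stdint.h>

    static uint8_t filter6(const uint8_t *src, const int16_t *f)
    {
        int sum = 0;
        for (int i = 0; i < 6; i++)         /* 6-tap subpel filter */
            sum += f[i] * src[i - 2];       /* window is src[-2..3] */
        sum = (sum + 64) >> 7;              /* add #0x40; asr #7 */
        if (sum < 0)   sum = 0;             /* usat #8 clamps to 0..255 */
        if (sum > 255) sum = 255;
        return (uint8_t)sum;
    }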
1922 +
1923 +@ void put_vp8_epel_h6(uint8_t *dst, int dststride, uint8_t *src,
1924 +@ int srcstride, int w, int h, int mx)
1925 +function ff_put_vp8_epel_h6_armv6, export=1
1926 + push {r4 - r11, lr}
1927 +
1928 + sub r2, r2, #2
1929 + movrel lr, sixtap_filters_13245600 - 16
1930 + ldr r12,[sp, #44] @ vp8_filter index
1931 + ldr r4, [sp, #36] @ width
1932 + add lr, lr, r12, lsl #3
1933 + sub r3, r3, r4 @ src_stride - block_width
1934 + sub r1, r1, r4 @ dst_stride - block_width
1935 + lsr r4, #2
1936 +
1937 + str r4, [sp, #36] @ "4-in-parallel" loop counter @40
1938 + str r3, [sp, #44] @ src_stride - block_width @48
1939 + push {r1} @ dst_stride - block_width @0
1940 + @ height @44
1941 +
1942 + ldr r1, [lr], #4 @ coefficients
1943 + ldr r3, [lr], #4
1944 + ldr lr, [lr]
1945 +1:
1946 + @ 3 loads, 10 shuffles and then mul/acc/add/shr
1947 + @ o0: i0/i1/i2/i3/i4/i5 -> i0/i2 (ld1) | i1/i3 (ld1) | i4/i5 (ld2)
1948 + @ o1: i1/i2/i3/i4/i5/i6 -> i1/i3 (ld1) | i2/i4 (ld2) | i5/i6 (ld2/3)
1949 + @ o2: i2/i3/i4/i5/i6/i7 -> i2/i4 (ld2) | i3/i5 (ld2) | i6/i7 (ld3)
1950 + @ o3: i3/i4/i5/i6/i7/i8 -> i3/i5 (ld2) | i4/i6 (ld2/3) | i7/i8 (ld3)
1951 + ldr r7, [r2, #5] @ ld3 -> src[5-8]
1952 + ldr r6, [r2, #2] @ ld2 -> src[2-5]
1953 + ldr r5, [r2], #4 @ ld1 -> src[0-3]
1954 +
1955 + pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6]
1956 + uxtb16 r9, r6, ror #8 @ src[5] | src[3]
1957 + uxtb16 r6, r6 @ src[4] | src[2]
1958 + uxtb16 r8, r5, ror #8 @ src[3] | src[1]
1959 + uxtb16 r11, r7, ror #8 @ src[8] | src[7]
1960 + uxtb16 r7, r7 @ src[7] | src[6]
1961 + pkhtb r10, r9, r6, asr #16 @ src[5] | src[4]
1962 + uxtb16 r5, r5 @ src[2] | src[0]
1963 +
1964 + smuad r11, r11, lr @ filter[3][2] -> r11
1965 + subs r4, r4, #1
1966 + pkhbt r12, r10, r7, lsl #16 @ src[6] | src[4]
1967 + smuad r7, r7, lr @ filter[2][2] -> r7
1968 + smuad r5, r5, r1 @ filter[0][0] -> r5
1969 + smlad r11, r9, r1, r11 @ filter[3][0] -> r11
1970 + smlad r7, r9, r3, r7 @ filter[2][1] -> r7
1971 + smuad r9, r8, r1 @ filter[1][0] -> r9
1972 + smlad r5, r8, r3, r5 @ filter[0][1] -> r5
1973 + pkhtb r8, r12, r10, asr #16 @ src[6] | src[5]
1974 + smlad r11, r12, r3, r11 @ filter[3][1] -> r11
1975 + smlad r9, r6, r3, r9 @ filter[1][1] -> r9
1976 + smlad r5, r10, lr, r5 @ filter[0][2] -> r5
1977 + smlad r7, r6, r1, r7 @ filter[2][0] -> r7
1978 + smlad r9, r8, lr, r9 @ filter[1][2] -> r9
1979 +
1980 + add r5, r5, #0x40 @ round_shift_and_clamp[0]
1981 + add r9, r9, #0x40 @ round_shift_and_clamp[1]
1982 + add r7, r7, #0x40 @ round_shift_and_clamp[2]
1983 + add r11, r11, #0x40 @ round_shift_and_clamp[3]
1984 +
1985 + usat r5, #8, r5, asr #7
1986 + usat r9, #8, r9, asr #7
1987 + usat r7, #8, r7, asr #7
1988 + usat r11, #8, r11, asr #7
1989 +
1990 + strb r5, [r0], #1 @ store res[0]
1991 + strb r9, [r0], #1 @ store res[1]
1992 + strb r7, [r0], #1 @ store res[2]
1993 + strb r11,[r0], #1 @ store res[3]
1994 +
1995 + bne 1b
1996 +
1997 + ldr r12,[sp, #44] @ height = outer-loop counter
1998 + subs r12, r12, #1
1999 +T itttt ne
2000 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter
2001 + ldrne r5, [sp, #48]
2002 + ldrne r6, [sp]
2003 + strne r12,[sp, #44]
2004 + add r2, r2, r5 @ move to next input/output lines
2005 + add r0, r0, r6
2006 +
2007 + bne 1b
2008 +
2009 + add sp, sp, #4 @ restore stack after push{r1} above
2010 + pop {r4 - r11, pc}
2011 +endfunc
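
The pairing scheme commented at the top of the loop above leans on the ARMv6 dual 16-bit multiply-accumulate instructions. A hedged C model of their semantics (smuad/smlad named after the instructions, not drop-in intrinsics):

    #include <stdint.h>

    /* lo(x)*lo(y) + hi(x)*hi(y), lanes treated as signed 16-bit */
    static int32_t smuad(uint32_t x, uint32_t y)
    {
        return (int16_t)x * (int16_t)y
             + (int16_t)(x >> 16) * (int16_t)(y >> 16);
    }

    static int32_t smlad(uint32_t x, uint32_t y, int32_t acc)
    {
        return acc + smuad(x, y);
    }

    /* One output pixel is then three dual-MACs, e.g. for o0:
       o0 = smlad(src54, f65, smlad(src31, f42, smuad(src20, f31)))
       where src20 = src[2]|src[0], f31 = f3|f1, etc. (illustrative names) */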
2012 +
2013 +@ void put_vp8_epel_v6(uint8_t *dst, int dststride, uint8_t *src,
2014 +@ int srcstride, int w, int h, int my)
2015 +function ff_put_vp8_epel_v6_armv6, export=1
2016 + push {r4 - r11, lr}
2017 +
2018 + movrel lr, sixtap_filters_13245600 - 16
2019 + ldr r12,[sp, #44] @ vp8_filter index
2020 + ldr r4, [sp, #36] @ width
2021 + add lr, lr, r12, lsl #3
2022 + sub r1, r1, r4 @ dst_stride - block_width
2023 + lsr r4, #2
2024 +
2025 + str r4, [sp, #36] @ "4-in-parallel" loop counter @40
2026 + str r3, [sp, #44] @ src_stride @48
2027 + push {r1} @ dst_stride - block_width @0
2028 + @ height @44
2029 +1:
2030 + add r1, r3, r3, lsl #1 @ stride * 3
2031 + ldr_dpren r5, r2, r3 @ src[0,1,2,3 + stride * 1]
2032 + ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3]
2033 + ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4]
2034 + ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5]
2035 +
2036 + @ byte -> word and "transpose"
2037 + uxtb16 r9, r5, ror #8 @ src[3 + stride*1] | src[1 + stride*1]
2038 + uxtb16 r10, r6, ror #8 @ src[3 + stride*3] | src[1 + stride*3]
2039 + uxtb16 r11, r7, ror #8 @ src[3 + stride*4] | src[1 + stride*4]
2040 + uxtb16 r12, r8, ror #8 @ src[3 + stride*5] | src[1 + stride*5]
2041 + uxtb16 r5, r5 @ src[2 + stride*1] | src[0 + stride*1]
2042 + uxtb16 r6, r6 @ src[2 + stride*3] | src[0 + stride*3]
2043 + uxtb16 r7, r7 @ src[2 + stride*4] | src[0 + stride*4]
2044 + uxtb16 r8, r8 @ src[2 + stride*5] | src[0 + stride*5]
2045 + pkhbt r1, r9, r10, lsl #16 @ src[1 + stride*3] | src[1 + stride*1]
2046 + pkhtb r9, r10, r9, asr #16 @ src[3 + stride*3] | src[3 + stride*1]
2047 + pkhbt r10, r11, r12, lsl #16 @ src[1 + stride*5] | src[1 + stride*4]
2048 + pkhtb r11, r12, r11, asr #16 @ src[3 + stride*5] | src[3 + stride*4]
2049 + pkhbt r12, r5, r6, lsl #16 @ src[0 + stride*3] | src[0 + stride*1]
2050 + pkhtb r5, r6, r5, asr #16 @ src[2 + stride*3] | src[2 + stride*1]
2051 + pkhbt r6, r7, r8, lsl #16 @ src[0 + stride*5] | src[0 + stride*4]
2052 + pkhtb r7, r8, r7, asr #16 @ src[2 + stride*5] | src[2 + stride*4]
2053 +
2054 + ldr r8, [lr, #4] @ stall - if only I had more registers...
2055 + smuad r12, r12, r8 @ filter[0][1]
2056 + smuad r1, r1, r8 @ filter[1][1]
2057 + smuad r5, r5, r8 @ filter[2][1]
2058 + smuad r9, r9, r8 @ filter[3][1]
2059 + ldr r8, [lr, #8] @ stall - if only I had more registers...
2060 + smlad r12, r6, r8, r12 @ filter[0][2]
2061 + smlad r1, r10, r8, r1 @ filter[1][2]
2062 + ldr_dpren r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0]
2063 + ldr r10,[r2], #4 @ src[0,1,2,3 + stride * 2]
2064 + smlad r5, r7, r8, r5 @ filter[2][2]
2065 + smlad r9, r11, r8, r9 @ filter[3][2]
2066 +
2067 + uxtb16 r7, r6, ror #8 @ src[3 + stride*0] | src[1 + stride*0]
2068 + uxtb16 r11, r10, ror #8 @ src[3 + stride*2] | src[1 + stride*2]
2069 + uxtb16 r6, r6 @ src[2 + stride*0] | src[0 + stride*0]
2070 + uxtb16 r10, r10 @ src[2 + stride*2] | src[0 + stride*2]
2071 +
2072 + pkhbt r8, r7, r11, lsl #16 @ src[1 + stride*2] | src[1 + stride*0]
2073 + pkhtb r7, r11, r7, asr #16 @ src[3 + stride*2] | src[3 + stride*0]
2074 + pkhbt r11, r6, r10, lsl #16 @ src[0 + stride*2] | src[0 + stride*0]
2075 + pkhtb r6, r10, r6, asr #16 @ src[2 + stride*2] | src[2 + stride*0]
2076 +
2077 + ldr r10,[lr] @ stall - if only I had more registers...
2078 + subs r4, r4, #1 @ counter--
2079 + smlad r12, r11, r10, r12 @ filter[0][0]
2080 + smlad r1, r8, r10, r1 @ filter[1][0]
2081 + smlad r5, r6, r10, r5 @ filter[2][0]
2082 + smlad r9, r7, r10, r9 @ filter[3][0]
2083 +
2084 + add r12, r12, #0x40 @ round_shift_and_clamp[0]
2085 + add r1, r1, #0x40 @ round_shift_and_clamp[1]
2086 + add r5, r5, #0x40 @ round_shift_and_clamp[2]
2087 + add r9, r9, #0x40 @ round_shift_and_clamp[3]
2088 +
2089 + usat r12, #8, r12, asr #7
2090 + usat r1, #8, r1, asr #7
2091 + usat r5, #8, r5, asr #7
2092 + usat r9, #8, r9, asr #7
2093 +
2094 + strb r12,[r0], #1 @ store res[0]
2095 + strb r1, [r0], #1 @ store res[1]
2096 + strb r5, [r0], #1 @ store res[2]
2097 + strb r9, [r0], #1 @ store res[3]
2098 +
2099 + bne 1b
2100 +
2101 + ldr r12,[sp, #44] @ height = outer-loop counter
2102 + subs r12, r12, #1
2103 +T itttt ne
2104 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter
2105 + ldrne r6, [sp, #0]
2106 + subne r2, r2, r4, lsl #2
2107 + strne r12,[sp, #44]
2108 + add r0, r0, r6
2109 + add r2, r2, r3 @ move to next input/output lines
2110 +
2111 + bne 1b
2112 +
2113 + add sp, sp, #4 @ restore stack after push{r1} above
2114 + pop {r4 - r11, pc}
2115 +endfunc
2116 +
2117 +@ void put_vp8_epel_h4(uint8_t *dst, int dststride, uint8_t *src,
2118 +@ int srcstride, int w, int h, int mx)
2119 +function ff_put_vp8_epel_h4_armv6, export=1
2120 + push {r4 - r11, lr}
2121 +
2122 + subs r2, r2, #1
2123 + movrel lr, fourtap_filters_1324 - 4
2124 + ldr r4, [sp, #36] @ width
2125 + ldr r12,[sp, #44] @ vp8_filter index
2126 + add lr, lr, r12, lsl #2
2127 + sub r3, r3, r4 @ src_stride - block_width
2128 + sub r1, r1, r4 @ dst_stride - block_width
2129 + ldr r5, [lr]
2130 + ldr r6, [lr, #4]
2131 + asr r4, #2
2132 +
2133 + ldr lr, [sp, #40] @ height = outer-loop counter
2134 + str r4, [sp, #36] @ "4-in-parallel" inner loop counter
2135 +1:
2136 + @ 3 loads, 5 uxtb16s and then mul/acc/add/shr
2137 + @ o0: i0/i1/i2/i3 -> i0/i2(ld1) + i1/i3(ld1)
2138 + @ o1: i1/i2/i3/i4 -> i1/i3(ld1) + i2/i4(ld2)
2139 + @ o2: i2/i3/i4/i5 -> i2/i4(ld2) + i3/i5(ld2)
2140 + @ o3: i3/i4/i5/i6 -> i3/i5(ld2) + i4/i6(ld3)
2141 + ldr r9, [r2, #3] @ load source data
2142 + ldr r8, [r2, #2]
2143 + ldr r7, [r2], #4
2144 +
2145 + uxtb16 r9, r9, ror #8 @ src[6] | src[4]
2146 + uxtb16 r10, r8, ror #8 @ src[5] | src[3]
2147 + uxtb16 r8, r8 @ src[4] | src[2]
2148 + uxtb16 r11, r7, ror #8 @ src[3] | src[1]
2149 + uxtb16 r7, r7 @ src[2] | src[0]
2150 +
2151 + smuad r9, r9, r6 @ filter[3][1] -> r9
2152 + smuad r12, r10, r6 @ filter[2][1] -> r12
2153 + smuad r7, r7, r5 @ filter[0][0] -> r7
2154 + smlad r9, r10, r5, r9 @ filter[3][0] -> r9
2155 + smuad r10, r11, r5 @ filter[1][0] -> r10
2156 + smlad r12, r8, r5, r12 @ filter[2][0] -> r12
2157 + smlad r7, r11, r6, r7 @ filter[0][1] -> r7
2158 + smlad r10, r8, r6, r10 @ filter[1][1] -> r10
2159 +
2160 + subs r4, r4, #1 @ counter--
2161 +
2162 + add r7, r7, #0x40 @ round_shift_and_clamp[0]
2163 + add r10, r10, #0x40 @ round_shift_and_clamp[1]
2164 + add r12, r12, #0x40 @ round_shift_and_clamp[2]
2165 + add r9, r9, #0x40 @ round_shift_and_clamp[3]
2166 +
2167 + usat r7, #8, r7, asr #7
2168 + usat r10, #8, r10, asr #7
2169 + usat r12, #8, r12, asr #7
2170 + usat r9, #8, r9, asr #7
2171 +
2172 + strb r7, [r0], #1 @ store res[0]
2173 + strb r10,[r0], #1 @ store res[1]
2174 + strb r12,[r0], #1 @ store res[2]
2175 + strb r9, [r0], #1 @ store res[3]
2176 +
2177 + bne 1b
2178 +
2179 + subs lr, lr, #1
2180 +T it ne
2181 + ldrne r4, [sp, #36] @ 4-in-parallel loop counter
2182 + add r2, r2, r3 @ move to next input/output lines
2183 + add r0, r0, r1
2184 +
2185 + bne 1b
2186 +
2187 + pop {r4 - r11, pc}
2188 +endfunc
2189 +
2190 +@ void put_vp8_epel_v4(uint8_t *dst, int dststride, uint8_t *src,
2191 +@ int srcstride, int w, int h, int my)
2192 +function ff_put_vp8_epel_v4_armv6, export=1
2193 + push {r4 - r11, lr}
2194 +
2195 + movrel lr, fourtap_filters_1324 - 4
2196 + ldr r12,[sp, #44] @ vp8_filter index
2197 + ldr r4, [sp, #36] @ width
2198 + add lr, lr, r12, lsl #2
2199 + sub r1, r1, r4 @ dst_stride - block_width
2200 + asr r4, #2
2201 + ldr r5, [lr]
2202 + ldr r6, [lr, #4]
2203 +
2204 + str r4, [sp, #36] @ "4-in-parallel" loop counter @40
2205 + str r3, [sp, #44] @ src_stride @48
2206 + push {r1} @ dst_stride - block_width @36
2207 + @ height @44
2208 +1:
2209 + ldr lr, [r2, r3, lsl #1] @ load source pixels
2210 + ldr r12,[r2, r3]
2211 + ldr_dpren r7, r2, r3
2212 + ldr r11,[r2], #4
2213 +
2214 + @ byte -> word and "transpose"
2215 + uxtb16 r8, lr, ror #8 @ src[3 + stride*3] | src[1 + stride*3]
2216 + uxtb16 r9, r12, ror #8 @ src[3 + stride*2] | src[1 + stride*2]
2217 + uxtb16 r3, r7, ror #8 @ src[3 + stride*0] | src[1 + stride*0]
2218 + uxtb16 r1, r11, ror #8 @ src[3 + stride*1] | src[1 + stride*1]
2219 + uxtb16 lr, lr @ src[2 + stride*3] | src[0 + stride*3]
2220 + uxtb16 r12, r12 @ src[2 + stride*2] | src[0 + stride*2]
2221 + uxtb16 r7, r7 @ src[2 + stride*0] | src[0 + stride*0]
2222 + uxtb16 r11, r11 @ src[2 + stride*1] | src[0 + stride*1]
2223 + pkhbt r10, r1, r8, lsl #16 @ src[1 + stride*3] | src[1 + stride*1]
2224 + pkhtb r1, r8, r1, asr #16 @ src[3 + stride*3] | src[3 + stride*1]
2225 + pkhbt r8, r3, r9, lsl #16 @ src[1 + stride*2] | src[1 + stride*0]
2226 + pkhtb r3, r9, r3, asr #16 @ src[3 + stride*2] | src[3 + stride*0]
2227 + pkhbt r9, r11, lr, lsl #16 @ src[0 + stride*3] | src[0 + stride*1]
2228 + pkhtb r11, lr, r11, asr #16 @ src[2 + stride*3] | src[2 + stride*1]
2229 + pkhbt lr, r7, r12, lsl #16 @ src[0 + stride*2] | src[0 + stride*0]
2230 + pkhtb r7, r12, r7, asr #16 @ src[2 + stride*2] | src[2 + stride*0]
2231 +
2232 + smuad r9, r9, r6 @ filter[0][1]
2233 + smuad r10, r10, r6 @ filter[1][1]
2234 + smuad r11, r11, r6 @ filter[2][1]
2235 + smuad r1, r1, r6 @ filter[3][1]
2236 + smlad r9, lr, r5, r9 @ filter[0][0]
2237 + smlad r10, r8, r5, r10 @ filter[1][0]
2238 + smlad r11, r7, r5, r11 @ filter[2][0]
2239 + smlad r1, r3, r5, r1 @ filter[3][0]
2240 +
2241 + subs r4, r4, #1 @ counter--
2242 + ldr r3, [sp, #48] @ FIXME prevent clobber of r3 above?
2243 +
2244 + add r9, r9, #0x40 @ round_shift_and_clamp[0]
2245 + add r10, r10, #0x40 @ round_shift_and_clamp[1]
2246 + add r11, r11, #0x40 @ round_shift_and_clamp[2]
2247 + add r1, r1, #0x40 @ round_shift_and_clamp[3]
2248 +
2249 + usat r9, #8, r9, asr #7
2250 + usat r10, #8, r10, asr #7
2251 + usat r11, #8, r11, asr #7
2252 + usat r1, #8, r1, asr #7
2253 +
2254 + strb r9, [r0], #1 @ store result
2255 + strb r10,[r0], #1
2256 + strb r11,[r0], #1
2257 + strb r1, [r0], #1
2258 +
2259 + bne 1b
2260 +
2261 + ldr r12,[sp, #44] @ height = outer-loop counter
2262 + subs r12, r12, #1
2263 +T ittt ne
2264 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter
2265 + ldrne r9, [sp, #0]
2266 + strne r12,[sp, #44]
2267 + sub r2, r2, r4, lsl #2
2268 + add r0, r0, r9
2269 + add r2, r2, r3 @ move to next input/output lines
2270 +
2271 + bne 1b
2272 +
2273 + add sp, sp, #4 @ restore stack after push{r1} above
2274 + pop {r4 - r11, pc}
2275 +endfunc
2276 +
2277 +@ void put_vp8_bilin_h(uint8_t *dst, int dststride, uint8_t *src,
2278 +@ int srcstride, int w, int h, int mx)
2279 +function ff_put_vp8_bilin_h_armv6, export=1
2280 + push {r4 - r9, lr}
2281 +
2282 + ldr r8, [sp, #36] @ vp8_filter index
2283 + ldr r12,[sp, #32] @ height = outer-loop counter
2284 + ldr r4, [sp, #28] @ width
2285 + lsl r5, r8, #16 @ mx << 16
2286 + sub r3, r3, r4 @ src_stride - block_width
2287 + sub r1, r1, r4 @ dst_stride - block_width
2288 + asr r4, #2
2289 + sub r5, r5, r8 @ (mx << 16) | (-mx)
2290 + str r4, [sp, #28] @ "4-in-parallel" loop counter
2291 + add r5, r5, #8 @ (8 - mx) | (mx << 16) = filter coefficients
2292 +1:
2293 + ldrb r6, [r2], #1 @ load source data
2294 + ldrb r7, [r2], #1
2295 + ldrb r8, [r2], #1
2296 + ldrb r9, [r2], #1
2297 + ldrb lr, [r2]
2298 +
2299 + pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0]
2300 + pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1]
2301 + pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2]
2302 + pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3]
2303 +
2304 + smuad r6, r6, r5 @ apply the filter
2305 + smuad r7, r7, r5
2306 + smuad r8, r8, r5
2307 + smuad r9, r9, r5
2308 +
2309 + subs r4, r4, #1 @ counter--
2310 +
2311 + add r6, r6, #0x4 @ round_shift_and_clamp
2312 + add r7, r7, #0x4
2313 + add r8, r8, #0x4
2314 + add r9, r9, #0x4
2315 +
2316 + asr r6, #3
2317 + asr r7, #3
2318 + pkhbt r6, r6, r8, lsl #13
2319 + pkhbt r7, r7, r9, lsl #13
2320 + orr r6, r6, r7, lsl #8
2321 + str r6, [r0], #4 @ store result
2322 +
2323 + bne 1b
2324 +
2325 + ldr r4, [sp, #28] @ 4-in-parallel loop counter
2326 + subs r12, r12, #1
2327 +
2328 + add r2, r2, r3 @ move to next input/output lines
2329 + add r0, r0, r1
2330 +
2331 + bne 1b
2332 +
2333 + pop {r4 - r9, pc}
2334 +endfunc
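
A scalar sketch of what each smuad / add #4 / asr #3 step above computes, given the coefficients packed as (mx << 16) | (8 - mx); bilin is an illustrative name:

    #include <stdint.h>

    static uint8_t bilin(uint8_t a, uint8_t b, int mx)   /* 0 <= mx < 8 */
    {
        return (uint8_t)(((8 - mx) * a + mx * b + 4) >> 3);
    }

    /* No clamp is needed: 8*255 + 4 >> 3 is still 255. The pkhbt/orr tail
       then packs four such 8-bit results into one 32-bit store. */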
2335 +
2336 +@ void put_vp8_bilin_v(uint8_t *dst, int dststride, uint8_t *src,
2337 +@ int srcstride, int w, int h, int my)
2338 +function ff_put_vp8_bilin_v_armv6, export=1
2339 + push {r4 - r11, lr}
2340 +
2341 + ldr r11,[sp, #44] @ vp8_filter index
2342 + ldr r4, [sp, #36] @ width
2343 + mov r5, r11, lsl #16 @ mx << 16
2344 + ldr r12,[sp, #40] @ height = outer-loop counter
2345 + sub r1, r1, r4
2346 + sub r5, r5, r11 @ (mx << 16) | (-mx)
2347 + asr r4, #2
2348 + add r5, r5, #8 @ (8 - mx) | (mx << 16) = filter coefficients
2349 + str r4, [sp, #36] @ "4-in-parallel" loop counter
2350 +1:
2351 + ldrb r10,[r2, r3] @ load the data
2352 + ldrb r6, [r2], #1
2353 + ldrb r11,[r2, r3]
2354 + ldrb r7, [r2], #1
2355 + ldrb lr, [r2, r3]
2356 + ldrb r8, [r2], #1
2357 + ldrb r9, [r2, r3]
2358 + pkhbt r6, r6, r10, lsl #16
2359 + ldrb r10,[r2], #1
2360 + pkhbt r7, r7, r11, lsl #16
2361 + pkhbt r8, r8, lr, lsl #16
2362 + pkhbt r9, r10, r9, lsl #16
2363 +
2364 + smuad r6, r6, r5 @ apply the filter
2365 + smuad r7, r7, r5
2366 + smuad r8, r8, r5
2367 + smuad r9, r9, r5
2368 +
2369 + subs r4, r4, #1 @ counter--
2370 +
2371 + add r6, r6, #0x4 @ round_shift_and_clamp
2372 + add r7, r7, #0x4
2373 + add r8, r8, #0x4
2374 + add r9, r9, #0x4
2375 +
2376 + asr r6, #3
2377 + asr r7, #3
2378 + pkhbt r6, r6, r8, lsl #13
2379 + pkhbt r7, r7, r9, lsl #13
2380 + orr r6, r6, r7, lsl #8
2381 + str r6, [r0], #4 @ store result
2382 +
2383 + bne 1b
2384 +
2385 + ldr r4, [sp, #36] @ 4-in-parallel loop counter
2386 + subs r12, r12, #1
2387 +
2388 + add r2, r2, r3 @ move to next input/output lines
2389 + add r0, r0, r1
2390 + sub r2, r2, r4, lsl #2
2391 +
2392 + bne 1b
2393 + pop {r4 - r11, pc}
2394 +endfunc
2395 diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
2396 index 269c6e3..74d9581 100644
2397 --- a/libavcodec/arm/vp8dsp_init_arm.c
2398 +++ b/libavcodec/arm/vp8dsp_init_arm.c
2399 @@ -19,13 +19,17 @@
2400 #include <stdint.h>
2401 #include "libavcodec/vp8dsp.h"
2402
2403 -void ff_vp8_luma_dc_wht_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
2404 -void ff_vp8_luma_dc_wht_dc_neon(DCTELEM block[4][4][16], DCTELEM dc[16]);
2405 +void ff_vp8_luma_dc_wht_dc_armv6(DCTELEM block[4][4][16], DCTELEM dc[16]);
2406
2407 -void ff_vp8_idct_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
2408 -void ff_vp8_idct_dc_add_neon(uint8_t *dst, DCTELEM block[16], int stride);
2409 -void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
2410 -void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, DCTELEM block[4][16], int stride);
2411 +#define idct_funcs(opt) \
2412 +void ff_vp8_luma_dc_wht_ ## opt(DCTELEM block[4][4][16], DCTELEM dc[16]); \
2413 +void ff_vp8_idct_add_ ## opt(uint8_t *dst, DCTELEM block[16], int stride); \
2414 +void ff_vp8_idct_dc_add_ ## opt(uint8_t *dst, DCTELEM block[16], int stride); \
2415 +void ff_vp8_idct_dc_add4y_ ## opt(uint8_t *dst, DCTELEM block[4][16], int stride); \
2416 +void ff_vp8_idct_dc_add4uv_ ## opt(uint8_t *dst, DCTELEM block[4][16], int stride)
2417 +
2418 +idct_funcs(neon);
2419 +idct_funcs(armv6);
2420
2421 void ff_vp8_v_loop_filter16_neon(uint8_t *dst, int stride,
2422 int flim_E, int flim_I, int hev_thresh);
2423 @@ -47,29 +51,106 @@ void ff_vp8_h_loop_filter8uv_inner_neon(uint8_t *dstU, uint8_t *dstV,
2424 int stride, int flim_E, int flim_I,
2425 int hev_thresh);
2426
2427 -void ff_vp8_v_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
2428 -void ff_vp8_h_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim);
2429 +void ff_vp8_v_loop_filter_inner_armv6(uint8_t *dst, int stride,
2430 + int flim_E, int flim_I,
2431 + int hev_thresh, int count);
2432 +void ff_vp8_h_loop_filter_inner_armv6(uint8_t *dst, int stride,
2433 + int flim_E, int flim_I,
2434 + int hev_thresh, int count);
2435 +void ff_vp8_v_loop_filter_armv6(uint8_t *dst, int stride,
2436 + int flim_E, int flim_I,
2437 + int hev_thresh, int count);
2438 +void ff_vp8_h_loop_filter_armv6(uint8_t *dst, int stride,
2439 + int flim_E, int flim_I,
2440 + int hev_thresh, int count);
2441
2442 +static void ff_vp8_v_loop_filter16_armv6(uint8_t *dst, int stride,
2443 + int flim_E, int flim_I, int hev_thresh)
2444 +{
2445 + ff_vp8_v_loop_filter_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4);
2446 +}
2447 +
2448 +static void ff_vp8_h_loop_filter16_armv6(uint8_t *dst, int stride,
2449 + int flim_E, int flim_I, int hev_thresh)
2450 +{
2451 + ff_vp8_h_loop_filter_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4);
2452 +}
2453
2454 -#define VP8_MC(n) \
2455 - void ff_put_vp8_##n##_neon(uint8_t *dst, int dststride, \
2456 - uint8_t *src, int srcstride, \
2457 - int h, int x, int y)
2458 +static void ff_vp8_v_loop_filter8uv_armv6(uint8_t *dstU, uint8_t *dstV, int stride,
2459 + int flim_E, int flim_I, int hev_thresh)
2460 +{
2461 + ff_vp8_v_loop_filter_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2);
2462 + ff_vp8_v_loop_filter_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2);
2463 +}
2464 +
2465 +static void ff_vp8_h_loop_filter8uv_armv6(uint8_t *dstU, uint8_t *dstV, int stride,
2466 + int flim_E, int flim_I, int hev_thresh)
2467 +{
2468 + ff_vp8_h_loop_filter_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2);
2469 + ff_vp8_h_loop_filter_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2);
2470 +}
2471 +
2472 +static void ff_vp8_v_loop_filter16_inner_armv6(uint8_t *dst, int stride,
2473 + int flim_E, int flim_I, int hev_thresh)
2474 +{
2475 + ff_vp8_v_loop_filter_inner_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4);
2476 +}
2477 +
2478 +static void ff_vp8_h_loop_filter16_inner_armv6(uint8_t *dst, int stride,
2479 + int flim_E, int flim_I, int hev_thresh)
2480 +{
2481 + ff_vp8_h_loop_filter_inner_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4);
2482 +}
2483 +
2484 +static void ff_vp8_v_loop_filter8uv_inner_armv6(uint8_t *dstU, uint8_t *dstV,
2485 + int stride, int flim_E, int flim_I,
2486 + int hev_thresh)
2487 +{
2488 + ff_vp8_v_loop_filter_inner_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2);
2489 + ff_vp8_v_loop_filter_inner_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2);
2490 +}
2491 +
2492 +static void ff_vp8_h_loop_filter8uv_inner_armv6(uint8_t *dstU, uint8_t *dstV,
2493 + int stride, int flim_E, int flim_I,
2494 + int hev_thresh)
2495 +{
2496 + ff_vp8_h_loop_filter_inner_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2);
2497 + ff_vp8_h_loop_filter_inner_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2);
2498 +}
2499 +
2500 +#define simple_lf_funcs(opt) \
2501 +void ff_vp8_v_loop_filter16_simple_ ## opt(uint8_t *dst, int stride, int flim); \
2502 +void ff_vp8_h_loop_filter16_simple_ ## opt(uint8_t *dst, int stride, int flim)
2503 +
2504 +simple_lf_funcs(neon);
2505 +simple_lf_funcs(armv6);
2506 +
2507 +#define VP8_MC_OPT(n, opt) \
2508 + void ff_put_vp8_##n##_##opt(uint8_t *dst, int dststride, \
2509 + uint8_t *src, int srcstride, \
2510 + int h, int x, int y)
2511 +
2512 +#define VP8_MC(n) \
2513 + VP8_MC_OPT(n, neon)
2514
2515 #define VP8_EPEL(w) \
2516 - VP8_MC(pixels ## w); \
2517 VP8_MC(epel ## w ## _h4); \
2518 VP8_MC(epel ## w ## _h6); \
2519 - VP8_MC(epel ## w ## _v4); \
2520 VP8_MC(epel ## w ## _h4v4); \
2521 VP8_MC(epel ## w ## _h6v4); \
2522 + VP8_MC(epel ## w ## _v4); \
2523 VP8_MC(epel ## w ## _v6); \
2524 VP8_MC(epel ## w ## _h4v6); \
2525 VP8_MC(epel ## w ## _h6v6)
2526
2527 VP8_EPEL(16);
2528 +VP8_MC(pixels16);
2529 +VP8_MC_OPT(pixels16, armv6);
2530 VP8_EPEL(8);
2531 +VP8_MC(pixels8);
2532 +VP8_MC_OPT(pixels8, armv6);
2533 VP8_EPEL(4);
2534 +VP8_MC_OPT(pixels4, armv6);
2535
2536 VP8_MC(bilin16_h);
2537 VP8_MC(bilin16_v);
2538 @@ -81,83 +162,148 @@ VP8_MC(bilin4_h);
2539 VP8_MC(bilin4_v);
2540 VP8_MC(bilin4_hv);
2541
2542 +#define VP8_V6_MC(n) \
2543 +void ff_put_vp8_##n##_armv6(uint8_t *dst, int dststride, uint8_t *src, \
2544 + int srcstride, int w, int h, int mxy)
2545 +
2546 +VP8_V6_MC(epel_v6);
2547 +VP8_V6_MC(epel_h6);
2548 +VP8_V6_MC(epel_v4);
2549 +VP8_V6_MC(epel_h4);
2550 +VP8_V6_MC(bilin_v);
2551 +VP8_V6_MC(bilin_h);
2552 +
2553 +#define VP8_EPEL_HV(SIZE, TAPNUMX, TAPNUMY, NAME, HNAME, VNAME, MAXHEIGHT) \
2554 +static void ff_put_vp8_##NAME##SIZE##_##HNAME##VNAME##_armv6( \
2555 + uint8_t *dst, int dststride, uint8_t *src, \
2556 + int srcstride, int h, int mx, int my) \
2557 +{ \
2558 + DECLARE_ALIGNED(4, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
2559 + uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
2560 + src -= srcstride * (TAPNUMY / 2 - 1); \
2561 + ff_put_vp8_ ## NAME ## _ ## HNAME ## _armv6(tmp, SIZE, src, srcstride, \
2562 + SIZE, h + TAPNUMY - 1, mx); \
2563 + ff_put_vp8_ ## NAME ## _ ## VNAME ## _armv6(dst, dststride, tmpptr, SIZE, \
2564 + SIZE, h, my); \
2565 +}
2566 +
2567 +VP8_EPEL_HV(16, 6, 6, epel, h6, v6, 16);
2568 +VP8_EPEL_HV(16, 2, 2, bilin, h, v, 16);
2569 +VP8_EPEL_HV(8, 6, 6, epel, h6, v6, 16);
2570 +VP8_EPEL_HV(8, 4, 6, epel, h4, v6, 16);
2571 +VP8_EPEL_HV(8, 6, 4, epel, h6, v4, 16);
2572 +VP8_EPEL_HV(8, 4, 4, epel, h4, v4, 16);
2573 +VP8_EPEL_HV(8, 2, 2, bilin, h, v, 16);
2574 +VP8_EPEL_HV(4, 6, 6, epel, h6, v6, 8);
2575 +VP8_EPEL_HV(4, 4, 6, epel, h4, v6, 8);
2576 +VP8_EPEL_HV(4, 6, 4, epel, h6, v4, 8);
2577 +VP8_EPEL_HV(4, 4, 4, epel, h4, v4, 8);
2578 +VP8_EPEL_HV(4, 2, 2, bilin, h, v, 8);
2579 +
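For illustration, the first VP8_EPEL_HV line above (16, 6, 6, epel, h6, v6, 16) expands to roughly the following two-pass wrapper: the horizontal 6-tap pass writes h + 5 rows into an on-stack buffer, and the vertical pass reads them back starting two rows up. This is a hand expansion of the macro, not extra code in the patch:

    static void ff_put_vp8_epel16_h6v6_armv6(uint8_t *dst, int dststride,
                                             uint8_t *src, int srcstride,
                                             int h, int mx, int my)
    {
        DECLARE_ALIGNED(4, uint8_t, tmp)[16 * (16 + 6 - 1)];
        uint8_t *tmpptr = tmp + 16 * (6 / 2 - 1);   /* skip 2 leading rows */
        src -= srcstride * (6 / 2 - 1);             /* start 2 rows above */
        ff_put_vp8_epel_h6_armv6(tmp, 16, src, srcstride, 16, h + 6 - 1, mx);
        ff_put_vp8_epel_v6_armv6(dst, dststride, tmpptr, 16, 16, h, my);
    }
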
2580 +extern void put_vp8_epel4_v6_c(uint8_t *dst, int d, uint8_t *src, int s, int h, int mx, int my);
2581 +#undef printf
2582 +#define VP8_EPEL_H_OR_V(SIZE, NAME, HV) \
2583 +static void ff_put_vp8_##NAME##SIZE##_##HV##_armv6( \
2584 + uint8_t *dst, int dststride, uint8_t *src, \
2585 + int srcstride, int h, int mx, int my) \
2586 +{ \
2587 + ff_put_vp8_## NAME ## _ ## HV ## _armv6(dst, dststride, src, srcstride, \
2588 + SIZE, h, mx | my); \
2589 +}
2590 +
2591 +VP8_EPEL_H_OR_V(4, epel, h6);
2592 +VP8_EPEL_H_OR_V(4, epel, h4);
2593 +VP8_EPEL_H_OR_V(4, epel, v6);
2594 +VP8_EPEL_H_OR_V(4, epel, v4);
2595 +VP8_EPEL_H_OR_V(4, bilin, v);
2596 +VP8_EPEL_H_OR_V(4, bilin, h);
2597 +VP8_EPEL_H_OR_V(8, epel, h6);
2598 +VP8_EPEL_H_OR_V(8, epel, h4);
2599 +VP8_EPEL_H_OR_V(8, epel, v6);
2600 +VP8_EPEL_H_OR_V(8, epel, v4);
2601 +VP8_EPEL_H_OR_V(8, bilin, v);
2602 +VP8_EPEL_H_OR_V(8, bilin, h);
2603 +VP8_EPEL_H_OR_V(16, epel, h6);
2604 +VP8_EPEL_H_OR_V(16, epel, v6);
2605 +VP8_EPEL_H_OR_V(16, bilin, v);
2606 +VP8_EPEL_H_OR_V(16, bilin, h);
2607 +
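The mx | my argument in VP8_EPEL_H_OR_V works because these wrappers are only installed for pure-horizontal or pure-vertical table entries, where the unused subpel coordinate is 0, so the OR simply selects the live one. A trivial standalone check with hypothetical values:

    #include <assert.h>

    int main(void)
    {
        int mx = 4, my = 0;           /* pure-h entry: my is always 0 */
        assert((mx | my) == mx);
        mx = 0; my = 6;               /* pure-v entry: mx is always 0 */
        assert((mx | my) == my);
        return 0;
    }
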
2608 av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
2609 {
2610 +#define set_func_ptrs(opt) \
2611 + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_##opt; \
2612 + dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; \
2613 + \
2614 + dsp->vp8_idct_add = ff_vp8_idct_add_##opt; \
2615 + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_##opt; \
2616 + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_##opt; \
2617 + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_##opt; \
2618 + \
2619 + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_##opt; \
2620 + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_##opt; \
2621 + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_##opt; \
2622 + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_##opt; \
2623 + \
2624 + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_##opt; \
2625 + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_##opt; \
2626 + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_##opt; \
2627 + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_##opt; \
2628 + \
2629 + dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_##opt; \
2630 + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_##opt; \
2631 + \
2632 + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_##opt; \
2633 + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_##opt; \
2634 + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_##opt; \
2635 + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_##opt; \
2636 + \
2637 + dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_##opt; \
2638 + dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_##opt; \
2639 + dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_##opt; \
2640 + dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_##opt; \
2641 + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_##opt; \
2642 + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_##opt; \
2643 + dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_##opt; \
2644 + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_##opt; \
2645 + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_##opt; \
2646 + \
2647 + dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; \
2648 + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_##opt; \
2649 + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_##opt; \
2650 + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_##opt; \
2651 + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_##opt; \
2652 + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_##opt; \
2653 + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_##opt; \
2654 + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_##opt; \
2655 + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_##opt; \
2656 + \
2657 + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_##opt; \
2658 + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_##opt; \
2659 + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_##opt; \
2660 + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_##opt; \
2661 + \
2662 + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_##opt; \
2663 + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_##opt; \
2664 + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_##opt; \
2665 + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_##opt; \
2666 + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_##opt; \
2667 + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_##opt; \
2668 + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_##opt; \
2669 + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_##opt; \
2670 + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_##opt; \
2671 + \
2672 + dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; \
2673 + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_##opt; \
2674 + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_##opt; \
2675 + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_##opt; \
2676 + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_##opt; \
2677 + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_##opt; \
2678 + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_##opt; \
2679 + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_##opt; \
2680 + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_##opt
2681 if (HAVE_NEON) {
2682 - dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
2683 - dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_neon;
2684 -
2685 - dsp->vp8_idct_add = ff_vp8_idct_add_neon;
2686 - dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
2687 - dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
2688 - dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
2689 -
2690 - dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
2691 - dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
2692 - dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
2693 - dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
2694 -
2695 - dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
2696 - dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
2697 - dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
2698 - dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
2699 -
2700 - dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
2701 - dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
2702 -
2703 - dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
2704 - dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
2705 - dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
2706 - dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
2707 -
2708 - dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
2709 - dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
2710 - dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
2711 - dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
2712 - dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
2713 - dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
2714 - dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
2715 - dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
2716 - dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
2717 -
2718 - dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
2719 - dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
2720 - dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
2721 - dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
2722 - dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
2723 - dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
2724 - dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
2725 - dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
2726 - dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
2727 -
2728 - dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
2729 - dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
2730 - dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
2731 - dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
2732 - dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
2733 - dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
2734 - dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
2735 - dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
2736 - dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
2737 -
2738 - dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
2739 - dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
2740 - dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
2741 - dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
2742 - dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
2743 - dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
2744 - dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
2745 - dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
2746 - dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
2747 -
2748 - dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon;
2749 - dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
2750 - dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
2751 - dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
2752 - dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
2753 - dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
2754 - dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
2755 - dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
2756 - dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
2757 + set_func_ptrs(neon);
2758 + } else if (HAVE_ARMV6) {
2759 + set_func_ptrs(armv6);
2760 }
2761 }
2762 diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
2763 index 1b9f24e..8e79982 100644
2764 --- a/libavcodec/arm/vp8dsp_neon.S
2765 +++ b/libavcodec/arm/vp8dsp_neon.S
2766 @@ -76,18 +76,6 @@ function ff_vp8_luma_dc_wht_neon, export=1
2767 bx lr
2768 endfunc
2769
2770 -function ff_vp8_luma_dc_wht_dc_neon, export=1
2771 - ldrsh r2, [r1]
2772 - mov r3, #0
2773 - add r2, r2, #3
2774 - strh r3, [r1]
2775 - asr r2, r2, #3
2776 - .rept 16
2777 - strh r2, [r0], #32
2778 - .endr
2779 - bx lr
2780 -endfunc
2781 -
2782 function ff_vp8_idct_add_neon, export=1
2783 vld1.16 {q0-q1}, [r1,:128]
2784 movw r3, #20091
2785 @@ -741,23 +729,6 @@ function ff_put_vp8_pixels8_neon, export=1
2786 bx lr
2787 endfunc
2788
2789 -function ff_put_vp8_pixels4_neon, export=1
2790 - ldr r12, [sp, #0] @ h
2791 - push {r4-r6,lr}
2792 -1:
2793 - subs r12, r12, #4
2794 - ldr_post r4, r2, r3
2795 - ldr_post r5, r2, r3
2796 - ldr_post r6, r2, r3
2797 - ldr_post lr, r2, r3
2798 - str_post r4, r0, r1
2799 - str_post r5, r0, r1
2800 - str_post r6, r0, r1
2801 - str_post lr, r0, r1
2802 - bgt 1b
2803 - pop {r4-r6,pc}
2804 -endfunc
2805 -
2806 /* 4/6-tap 8th-pel MC */
2807
2808 .macro vp8_epel8_h6 d, a, b
2809 --
2810 1.7.5.4
2811