| OLD | NEW |
| (Empty) | |
| 1 From f2a35674d0ab9fc1852c088482fd51bf12e5ed45 Mon Sep 17 00:00:00 2001 |
| 2 From: "Ronald S. Bultje" <rsbultje@gmail.com> |
| 3 Date: Wed, 24 Aug 2011 13:58:37 -0700 |
| 4 Subject: [PATCH] VP8: armv6 optimizations. |
| 5 |
| 6 From 52.503s (~40fps) to 27.973sec (~80fps) decoding of 480p sintel |
| 7 trailer, i.e. a ~2x speedup overall, on a Nexus S. |
| 8 --- |
| 9 libavcodec/arm/Makefile | 3 +- |
| 10 libavcodec/arm/asm.S | 12 + |
| 11 libavcodec/arm/vp8dsp_armv6.S | 2328 ++++++++++++++++++++++++++++++++++++++ |
| 12 libavcodec/arm/vp8dsp_init_arm.c | 324 ++++-- |
| 13 libavcodec/arm/vp8dsp_neon.S | 29 - |
| 14 5 files changed, 2577 insertions(+), 119 deletions(-) |
| 15 create mode 100644 libavcodec/arm/vp8dsp_armv6.S |
| 16 |
| 17 diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile |
| 18 index 3374f0e..cc5a2a7 100644 |
| 19 --- a/libavcodec/arm/Makefile |
| 20 +++ b/libavcodec/arm/Makefile |
| 21 @@ -11,7 +11,8 @@ ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fix
ed_armv6.o |
| 22 OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o |
| 23 OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o |
| 24 OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o |
| 25 -ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o |
| 26 +ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \ |
| 27 + arm/vp8dsp_armv6.o |
| 28 |
| 29 OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o |
| 30 OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o |
| 31 diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S |
| 32 index a7d3ace..c398f37 100644 |
| 33 --- a/libavcodec/arm/asm.S |
| 34 +++ b/libavcodec/arm/asm.S |
| 35 @@ -97,6 +97,12 @@ T add \rn, \rn, \rm |
| 36 T ldr \rt, [\rn] |
| 37 .endm |
| 38 |
| 39 +.macro ldr_dpren rt, rn, rm:vararg |
| 40 +A ldr \rt, [\rn, -\rm] |
| 41 +T sub \rt, \rn, \rm |
| 42 +T ldr \rt, [\rt] |
| 43 +.endm |
| 44 + |
| 45 .macro ldr_post rt, rn, rm:vararg |
| 46 A ldr \rt, [\rn], \rm |
| 47 T ldr \rt, [\rn] |
| 48 @@ -133,6 +139,12 @@ T ldrh \rt, [\rn] |
| 49 T add \rn, \rn, \rm |
| 50 .endm |
| 51 |
| 52 +.macro ldrb_post rt, rn, rm |
| 53 +A ldrb \rt, [\rn], \rm |
| 54 +T ldrb \rt, [\rn] |
| 55 +T add \rn, \rn, \rm |
| 56 +.endm |
| 57 + |
| 58 .macro str_post rt, rn, rm:vararg |
| 59 A str \rt, [\rn], \rm |
| 60 T str \rt, [\rn] |
| 61 diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S |
| 62 new file mode 100644 |
| 63 index 0000000..4e7b783 |
| 64 --- /dev/null |
| 65 +++ b/libavcodec/arm/vp8dsp_armv6.S |
| 66 @@ -0,0 +1,2328 @@ |
| 67 +/** |
| 68 + * VP8 ARMv6 optimisations |
| 69 + * |
| 70 + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. |
| 71 + * Copyright (c) 2010 Rob Clark <rob@ti.com> |
| 72 + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> |
| 73 + * |
| 74 + * This file is part of Libav. |
| 75 + * |
| 76 + * Libav is free software; you can redistribute it and/or |
| 77 + * modify it under the terms of the GNU Lesser General Public |
| 78 + * License as published by the Free Software Foundation; either |
| 79 + * version 2.1 of the License, or (at your option) any later version. |
| 80 + * |
| 81 + * Libav is distributed in the hope that it will be useful, |
| 82 + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 83 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 84 + * Lesser General Public License for more details. |
| 85 + * |
| 86 + * You should have received a copy of the GNU Lesser General Public |
| 87 + * License along with Libav; if not, write to the Free Software |
| 88 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 89 + * |
| 90 + * This code was partially ported from libvpx, which uses this license: |
| 91 + * |
| 92 + * Use of this source code is governed by a BSD-style license |
| 93 + * that can be found in the LICENSE file in the root of the source |
| 94 + * tree. An additional intellectual property rights grant can be found |
| 95 + * in the file PATENTS. All contributing project authors may |
| 96 + * be found in the AUTHORS file in the root of the source tree. |
| 97 + * |
| 98 + * (Note that the "LICENSE", "AUTHORS" and "PATENTS" files can be |
| 99 + * found in the libvpx source tree.) |
| 100 + */ |
| 101 + |
| 102 +#include "asm.S" |
| 103 + |
| 104 +@ idct |
| 105 + |
| 106 +@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16]) |
| 107 +function ff_vp8_luma_dc_wht_armv6, export=1 |
| 108 + push {r4 - r10, lr} |
| 109 + |
| 110 + @ load dc[] and zero memory |
| 111 + mov r12, #0 |
| 112 + ldr r2, [r1] @ dc0[0,1] |
| 113 + ldr r3, [r1, #4] @ dc0[2,3] |
| 114 + ldr r4, [r1, #8] @ dc1[0,1] |
| 115 + ldr r5, [r1, #12] @ dc1[2,3] |
| 116 + ldr r6, [r1, #16] @ dc2[0,1] |
| 117 + ldr r7, [r1, #20] @ dc2[2,3] |
| 118 + ldr r8, [r1, #24] @ dc3[0,1] |
| 119 + ldr r9, [r1, #28] @ dc3[2,3] |
| 120 + str r12,[r1] |
| 121 + str r12,[r1, #4] |
| 122 + str r12,[r1, #8] |
| 123 + str r12,[r1, #12] |
| 124 + str r12,[r1, #16] |
| 125 + str r12,[r1, #20] |
| 126 + str r12,[r1, #24] |
| 127 + str r12,[r1, #28] |
| 128 + |
| 129 + @ loop1 |
| 130 + uadd16 r12, r2, r8 @ t0[0,1] |
| 131 + uadd16 r14, r3, r9 @ t0[2,3] |
| 132 + usub16 r2, r2, r8 @ t3[0,1] |
| 133 + usub16 r3, r3, r9 @ t3[2,3] |
| 134 + uadd16 r8, r4, r6 @ t1[0,1] |
| 135 + uadd16 r9, r5, r7 @ t1[2,3] |
| 136 + usub16 r4, r4, r6 @ t2[0,1] |
| 137 + usub16 r5, r5, r7 @ t2[2,3] |
| 138 + |
| 139 + uadd16 r6, r12, r8 @ dc0[0,1] |
| 140 + uadd16 r7, r14, r9 @ dc0[2,3] |
| 141 + usub16 r12, r12, r8 @ dc2[0,1] |
| 142 + usub16 r14, r14, r9 @ dc2[2,3] |
| 143 + uadd16 r8, r2, r4 @ dc1[0,1] |
| 144 + uadd16 r9, r3, r5 @ dc1[2,3] |
| 145 + usub16 r2, r2, r4 @ dc3[0,1] |
| 146 + usub16 r3, r3, r5 @ dc3[2,3] |
| 147 + |
| 148 + mov r1, #3 |
| 149 + orr r1, r1, #0x30000 @ 3 | 3 (round) |
| 150 + |
| 151 + @ "transpose" |
| 152 + pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0] |
| 153 + pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1] |
| 154 + pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0] |
| 155 + pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1] |
| 156 + pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2] |
| 157 + uadd16 r4, r4, r1 |
| 158 + uadd16 r5, r5, r1 |
| 159 + pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3] |
| 160 + pkhbt r2, r14, r3, lsl #16 @ dc{2,3}[2] |
| 161 + pkhtb r14, r3, r14, asr #16 @ dc{2,3}[3] |
| 162 + |
| 163 + @ loop2 |
| 164 + uadd16 r9, r4, r7 @ t0[0,1] |
| 165 + uadd16 r3, r5, r14 @ t0[2,3] |
| 166 + usub16 r4, r4, r7 @ t3[0,1] |
| 167 + usub16 r5, r5, r14 @ t3[2,3] |
| 168 + uadd16 r7, r6, r8 @ t1[0,1] |
| 169 + uadd16 r14, r12, r2 @ t1[2,3] |
| 170 + usub16 r6, r6, r8 @ t2[0,1] |
| 171 + usub16 r12, r12, r2 @ t2[2,3] |
| 172 + |
| 173 + uadd16 r8, r9, r7 @ block[0,1][0] |
| 174 + uadd16 r2, r3, r14 @ block[2,3][0] |
| 175 + usub16 r9, r9, r7 @ block[0,1][2] |
| 176 + usub16 r3, r3, r14 @ block[2,3][2] |
| 177 + uadd16 r7, r4, r6 @ block[0,1][1] |
| 178 + uadd16 r14, r5, r12 @ block[2,3][1] |
| 179 + usub16 r4, r4, r6 @ block[0,1][3] |
| 180 + usub16 r5, r5, r12 @ block[2,3][3] |
| 181 + |
| 182 + @ store |
| 183 + mov r6, r8, asr #19 @ block[1][0] |
| 184 + mov r12, r7, asr #19 @ block[1][1] |
| 185 + mov r1, r9, asr #19 @ block[1][2] |
| 186 + mov r10, r4, asr #19 @ block[1][3] |
| 187 + sxth r8, r8 |
| 188 + sxth r7, r7 |
| 189 + sxth r9, r9 |
| 190 + sxth r4, r4 |
| 191 + asr r8, #3 @ block[0][0] |
| 192 + asr r7, #3 @ block[0][1] |
| 193 + asr r9, #3 @ block[0][2] |
| 194 + asr r4, #3 @ block[0][3] |
| 195 + |
| 196 + strh r8, [r0], #32 |
| 197 + strh r7, [r0], #32 |
| 198 + strh r9, [r0], #32 |
| 199 + strh r4, [r0], #32 |
| 200 + strh r6, [r0], #32 |
| 201 + strh r12,[r0], #32 |
| 202 + strh r1, [r0], #32 |
| 203 + strh r10,[r0], #32 |
| 204 + |
| 205 + mov r6, r2, asr #19 @ block[3][0] |
| 206 + mov r12, r14, asr #19 @ block[3][1] |
| 207 + mov r1, r3, asr #19 @ block[3][2] |
| 208 + mov r10, r5, asr #19 @ block[3][3] |
| 209 + sxth r2, r2 |
| 210 + sxth r14, r14 |
| 211 + sxth r3, r3 |
| 212 + sxth r5, r5 |
| 213 + asr r2, #3 @ block[2][0] |
| 214 + asr r14, #3 @ block[2][1] |
| 215 + asr r3, #3 @ block[2][2] |
| 216 + asr r5, #3 @ block[2][3] |
| 217 + |
| 218 + strh r2, [r0], #32 |
| 219 + strh r14,[r0], #32 |
| 220 + strh r3, [r0], #32 |
| 221 + strh r5, [r0], #32 |
| 222 + strh r6, [r0], #32 |
| 223 + strh r12,[r0], #32 |
| 224 + strh r1, [r0], #32 |
| 225 + strh r10,[r0], #32 |
| 226 + |
| 227 + pop {r4 - r10, pc} |
| 228 +endfunc |
| 229 + |
| 230 +@ void vp8_luma_dc_wht_dc(DCTELEM block[4][4][16], DCTELEM dc[16]) |
| 231 +function ff_vp8_luma_dc_wht_dc_armv6, export=1 |
| 232 + ldrsh r2, [r1] |
| 233 + mov r3, #0 |
| 234 + add r2, r2, #3 |
| 235 + strh r3, [r1] |
| 236 + asr r2, r2, #3 |
| 237 + .rept 16 |
| 238 + strh r2, [r0], #32 |
| 239 + .endr |
| 240 + bx lr |
| 241 +endfunc |
| 242 + |
| 243 +@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride) |
| 244 +function ff_vp8_idct_add_armv6, export=1 |
| 245 + push {r4 - r11, lr} |
| 246 + sub sp, sp, #32 |
| 247 + |
| 248 + mov r3, #0x00004E00 @ cos |
| 249 + orr r3, r3, #0x0000007B @ cospi8sqrt2minus1 = 20091 |
| 250 + mov r4, #0x00008A00 @ sin |
| 251 + orr r4, r4, #0x0000008C @ sinpi8sqrt2 = 35468 |
| 252 + mov r5, #0x2 @ i=2 |
| 253 +1: |
| 254 + ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block
1[0] |
| 255 + ldr r12,[r1, #24] @ i13 | i12 = block3[1] | block
3[0] |
| 256 + ldr r14,[r1, #16] @ i9 | i8 = block2[1] | block
2[0] |
| 257 + |
| 258 + smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >
> 16 |
| 259 + smulwb r7, r3, r6 @ (ip[4] * cospi8sqrt2minus1) >
> 16 |
| 260 + smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16 |
| 261 + smulwb r8, r4, r6 @ (ip[4] * sinpi8sqrt2) >> 16 |
| 262 + pkhbt r7, r7, r9, lsl #16 @ 5c | 4c |
| 263 + smulwt r11, r3, r12 @ (ip[13] * cospi8sqrt2minus1)
>> 16 |
| 264 + pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first ha
lf |
| 265 + uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first ha
lf |
| 266 + smulwt r7, r4, r12 @ (ip[13] * sinpi8sqrt2) >> 16 |
| 267 + smulwb r9, r3, r12 @ (ip[12] * cospi8sqrt2minus1)
>> 16 |
| 268 + smulwb r10, r4, r12 @ (ip[12] * sinpi8sqrt2) >> 16 |
| 269 + |
| 270 + subs r5, r5, #1 @ i-- |
| 271 + pkhbt r9, r9, r11, lsl #16 @ 13c | 12c |
| 272 + ldr r11,[r1] @ i1 | i0 |
| 273 + pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second h
alf |
| 274 + uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 second h
alf |
| 275 + usub16 r7, r8, r7 @ c = t2 |
| 276 + uadd16 r6, r6, r10 @ d = t3 |
| 277 + uadd16 r10, r11, r14 @ a = t0 |
| 278 + usub16 r8, r11, r14 @ b = t1 |
| 279 + uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0] |
| 280 + usub16 r10, r10, r6 @ a-d = tmp{0,1}[3] |
| 281 + uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1] |
| 282 + usub16 r7, r8, r7 @ b-c = tmp{0,1}[2] |
| 283 + mov r8, #0 |
| 284 + str r6, [sp, #8] @ o5 | o4 |
| 285 + str r7, [sp, #16] @ o9 | o8 |
| 286 + str r10,[sp, #24] @ o13 | o12 |
| 287 + str r9, [sp], #4 @ o1 | o0 |
| 288 + str r8, [r1, #24] |
| 289 + str r8, [r1, #16] |
| 290 + str r8, [r1, #8] |
| 291 + str r8, [r1], #4 |
| 292 + bne 1b |
| 293 + |
| 294 + mov r5, #0x2 @ i=2 |
| 295 + sub sp, sp, #8 |
| 296 +2: |
| 297 + ldr r6, [sp, #8] @ i5 | i4 = tmp{0,1}[1] |
| 298 + ldr r14,[sp, #4] @ i3 | i2 = tmp{2,3}[0] |
| 299 + ldr r12,[sp, #12] @ i7 | i6 = tmp{2,3}[1] |
| 300 + ldr r1, [sp], #16 @ i1 | i0 = tmp{0,1}[0] |
| 301 + smulwt r9, r3, r6 @ (ip[5] * cospi8sqrt2minus1) >
> 16 |
| 302 + smulwt r7, r3, r1 @ (ip[1] * cospi8sqrt2minus1) >
> 16 |
| 303 + smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16 |
| 304 + smulwt r8, r4, r1 @ (ip[1] * sinpi8sqrt2) >> 16 |
| 305 + pkhbt r11, r1, r6, lsl #16 @ i4 | i0 = t0/t1 first half |
| 306 + pkhbt r7, r7, r9, lsl #16 @ 5c | 1c |
| 307 + pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = temp1 = t2 first ha
lf |
| 308 + pkhtb r1, r6, r1, asr #16 @ i5 | i1 |
| 309 + uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = temp2 (d) = t3
first half |
| 310 + pkhbt r9, r14, r12, lsl #16 @ i6 | i2 = t0/t1 second half |
| 311 + uadd16 r10, r11, r9 @ a = t0 |
| 312 + usub16 r9, r11, r9 @ b = t1 |
| 313 + pkhtb r6, r12, r14, asr #16 @ i7 | i3 |
| 314 + subs r5, r5, #0x1 @ i-- |
| 315 + smulwt r7, r3, r6 @ (ip[7] * cospi8sqrt2minus1) >
> 16 |
| 316 + smulwt r11, r4, r6 @ (ip[7] * sinpi8sqrt2) >> 16 |
| 317 + smulwb r12, r3, r6 @ (ip[3] * cospi8sqrt2minus1) >
> 16 |
| 318 + smulwb r14, r4, r6 @ (ip[3] * sinpi8sqrt2) >> 16 |
| 319 + |
| 320 + pkhbt r7, r12, r7, lsl #16 @ 7c | 3c |
| 321 + pkhbt r11, r14, r11, lsl #16 @ 7s | 3s = temp1 (d) = t3 seco
nd half |
| 322 + mov r14, #0x4 @ set up 4's |
| 323 + orr r14, r14, #0x40000 @ 4|4 |
| 324 + uadd16 r6, r7, r6 @ 7c+7 | 3c+3 = temp2 (c) = t2
second half |
| 325 + usub16 r12, r8, r6 @ c (o5 | o1) = t2 |
| 326 + uadd16 r6, r11, r1 @ d (o7 | o3) = t3 |
| 327 + uadd16 r10, r10, r14 @ t0 + 4 |
| 328 + uadd16 r9, r9, r14 @ t1 + 4 |
| 329 + uadd16 r7, r10, r6 @ a+d = dst{0,1}[0] |
| 330 + usub16 r6, r10, r6 @ a-d = dst{0,1}[3] |
| 331 + uadd16 r10, r9, r12 @ b+c = dst{0,1}[1] |
| 332 + usub16 r1, r9, r12 @ b-c = dst{0,1}[2] |
| 333 + |
| 334 + mov r9, r6, asr #3 @ o[1][3] |
| 335 + mov r12, r1, asr #3 @ o[1][2] |
| 336 + pkhtb r8, r12, r7, asr #19 @ o[1][0,2] |
| 337 + pkhtb r11, r9, r10, asr #19 @ o[1][1,3] |
| 338 + ldr r12,[r0] |
| 339 + ldr r9, [r0, r2] |
| 340 + sxth r7, r7 |
| 341 + sxth r6, r6 |
| 342 + sxth r10, r10 |
| 343 + sxth r1, r1 |
| 344 + asr r7, #3 @ o[0][0] |
| 345 + asr r10, #3 @ o[0][1] |
| 346 + pkhbt r7, r7, r1, lsl #13 @ o[0][0,2] |
| 347 + pkhbt r10, r10, r6, lsl #13 @ o[0][1,3] |
| 348 + |
| 349 + uxtab16 r7, r7, r12 |
| 350 + uxtab16 r10, r10, r12, ror #8 |
| 351 + uxtab16 r8, r8, r9 |
| 352 + uxtab16 r11, r11, r9, ror #8 |
| 353 + usat16 r7, #8, r7 |
| 354 + usat16 r10, #8, r10 |
| 355 + usat16 r8, #8, r8 |
| 356 + usat16 r11, #8, r11 |
| 357 + orr r7, r7, r10, lsl #8 |
| 358 + orr r8, r8, r11, lsl #8 |
| 359 + str r8, [r0, r2] |
| 360 + str_post r7, r0, r2, lsl #1 |
| 361 + |
| 362 + bne 2b |
| 363 + |
| 364 + pop {r4 - r11, pc} |
| 365 +endfunc |
| 366 + |
| 367 +@ void vp8_idct_dc_add(uint8_t *dst, DCTELEM block[16], int stride) |
| 368 +function ff_vp8_idct_dc_add_armv6, export=1 |
| 369 + push {r4 - r5, lr} |
| 370 + ldrsh r3, [r1] |
| 371 + mov r4, #0 |
| 372 + add r3, r3, #4 |
| 373 + asr r3, #3 |
| 374 + strh r4, [r1], #32 |
| 375 + ldr r4, [r0, r2] |
| 376 + ldr_post r5, r0, r2, lsl #1 |
| 377 + pkhbt r3, r3, r3, lsl #16 |
| 378 + |
| 379 + uxtab16 lr, r3, r5 @ a1+2 | a1+0 |
| 380 + uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1 |
| 381 + uxtab16 r12, r3, r4 |
| 382 + uxtab16 r4, r3, r4, ror #8 |
| 383 + usat16 lr, #8, lr |
| 384 + usat16 r5, #8, r5 |
| 385 + usat16 r12, #8, r12 |
| 386 + usat16 r4, #8, r4 |
| 387 + orr lr, lr, r5, lsl #8 |
| 388 + orr r12, r12, r4, lsl #8 |
| 389 + ldr r5, [r0] |
| 390 + ldr r4, [r0, r2] |
| 391 + sub r0, r0, r2, lsl #1 |
| 392 + str r12,[r0, r2] |
| 393 + str_post lr, r0, r2, lsl #1 |
| 394 + |
| 395 + uxtab16 lr, r3, r5 |
| 396 + uxtab16 r5, r3, r5, ror #8 |
| 397 + uxtab16 r12, r3, r4 |
| 398 + uxtab16 r4, r3, r4, ror #8 |
| 399 + usat16 lr, #8, lr |
| 400 + usat16 r5, #8, r5 |
| 401 + usat16 r12, #8, r12 |
| 402 + usat16 r4, #8, r4 |
| 403 + orr lr, lr, r5, lsl #8 |
| 404 + orr r12, r12, r4, lsl #8 |
| 405 + |
| 406 + str r12,[r0, r2] |
| 407 + str_post lr, r0, r2, lsl #1 |
| 408 + |
| 409 + pop {r4 - r5, pc} |
| 410 +endfunc |
| 411 + |
| 412 +@ void vp8_idct_dc_add4uv(uint8_t *dst, DCTELEM block[4][16], int stride) |
| 413 +function ff_vp8_idct_dc_add4uv_armv6, export=1 |
| 414 + push {lr} |
| 415 + |
| 416 + bl ff_vp8_idct_dc_add_armv6 |
| 417 + sub r0, r0, r2, lsl #2 |
| 418 + add r0, r0, #4 |
| 419 + bl ff_vp8_idct_dc_add_armv6 |
| 420 + sub r0, r0, #4 |
| 421 + bl ff_vp8_idct_dc_add_armv6 |
| 422 + sub r0, r0, r2, lsl #2 |
| 423 + add r0, r0, #4 |
| 424 + bl ff_vp8_idct_dc_add_armv6 |
| 425 + |
| 426 + pop {pc} |
| 427 +endfunc |
| 428 + |
| 429 +@ void vp8_idct_dc_add4y(uint8_t *dst, DCTELEM block[4][16], int stride) |
| 430 +function ff_vp8_idct_dc_add4y_armv6, export=1 |
| 431 + push {lr} |
| 432 + |
| 433 + bl ff_vp8_idct_dc_add_armv6 |
| 434 + sub r0, r0, r2, lsl #2 |
| 435 + add r0, r0, #4 |
| 436 + bl ff_vp8_idct_dc_add_armv6 |
| 437 + sub r0, r0, r2, lsl #2 |
| 438 + add r0, r0, #4 |
| 439 + bl ff_vp8_idct_dc_add_armv6 |
| 440 + sub r0, r0, r2, lsl #2 |
| 441 + add r0, r0, #4 |
| 442 + bl ff_vp8_idct_dc_add_armv6 |
| 443 + |
| 444 + pop {pc} |
| 445 +endfunc |
| 446 + |
| 447 +@ loopfilter |
| 448 + |
| 449 +@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim) |
| 450 +function ff_vp8_v_loop_filter16_simple_armv6, export=1 |
| 451 + push {r4 - r11, lr} |
| 452 + |
| 453 + ldr_dpren r3, r0, r1, lsl #1 @ p1 |
| 454 + ldr_dpren r4, r0, r1 @ p0 |
| 455 + ldr r5, [r0] @ q0 |
| 456 + ldr r6, [r0, r1] @ q1 |
| 457 + orr r2, r2, r2, lsl #16 |
| 458 + mov r9, #4 @ count |
| 459 + mov lr, #0 @ need 0 in a couple places |
| 460 + orr r12, r2, r2, lsl #8 @ splat int -> byte |
| 461 + ldr r2, c0x80808080 |
| 462 + |
| 463 +1: |
| 464 + @ vp8_simple_filter_mask() |
| 465 + uqsub8 r7, r3, r6 @ p1 - q1 |
| 466 + uqsub8 r8, r6, r3 @ q1 - p1 |
| 467 + uqsub8 r10, r4, r5 @ p0 - q0 |
| 468 + uqsub8 r11, r5, r4 @ q0 - p0 |
| 469 + orr r8, r8, r7 @ abs(p1 - q1) |
| 470 + orr r10, r10, r11 @ abs(p0 - q0) |
| 471 + uqadd8 r10, r10, r10 @ abs(p0 - q0) * 2 |
| 472 + uhadd8 r8, r8, lr @ abs(p1 - q2) >> 1 |
| 473 + uqadd8 r10, r10, r8 @ abs(p0 - q0)*2 + abs(p1 - q1)
/2 |
| 474 + mvn r8, #0 |
| 475 + usub8 r10, r12, r10 @ compare to flimit. usub8 sets
GE flags |
| 476 + sel r10, r8, lr @ filter mask: F or 0 |
| 477 + cmp r10, #0 |
| 478 + beq 2f @ skip filtering if all masks a
re 0x00 |
| 479 + |
| 480 + @ vp8_simple_filter() |
| 481 + eor r3, r3, r2 @ p1 offset to convert to a sig
ned value |
| 482 + eor r6, r6, r2 @ q1 offset to convert to a sig
ned value |
| 483 + eor r4, r4, r2 @ p0 offset to convert to a sig
ned value |
| 484 + eor r5, r5, r2 @ q0 offset to convert to a sig
ned value |
| 485 + |
| 486 + qsub8 r3, r3, r6 @ vp8_filter = p1 - q1 |
| 487 + qsub8 r6, r5, r4 @ q0 - p0 |
| 488 + qadd8 r3, r3, r6 @ += q0 - p0 |
| 489 + ldr r7, c0x04040404 |
| 490 + qadd8 r3, r3, r6 @ += q0 - p0 |
| 491 + ldr r8, c0x03030303 |
| 492 + qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0
)) |
| 493 + @STALL |
| 494 + and r3, r3, r10 @ vp8_filter &= mask |
| 495 + |
| 496 + qadd8 r7, r3, r7 @ Filter1 = vp8_filter + 4 |
| 497 + qadd8 r8, r3, r8 @ Filter2 = vp8_filter + 3 |
| 498 + |
| 499 + shadd8 r7, r7, lr |
| 500 + shadd8 r8, r8, lr |
| 501 + shadd8 r7, r7, lr |
| 502 + shadd8 r8, r8, lr |
| 503 + shadd8 r7, r7, lr @ Filter1 >>= 3 |
| 504 + shadd8 r8, r8, lr @ Filter2 >>= 3 |
| 505 + |
| 506 + qsub8 r5, r5, r7 @ u = q0 - Filter1 |
| 507 + qadd8 r4, r4, r8 @ u = p0 + Filter2 |
| 508 + eor r5, r5, r2 @ *oq0 = u^0x80 |
| 509 + eor r4, r4, r2 @ *op0 = u^0x80 |
| 510 +T sub r7, r0, r1 |
| 511 + str r5, [r0] @ store oq0 result |
| 512 +A str r4, [r0, -r1] @ store op0 result |
| 513 +T str r4, [r7] |
| 514 + |
| 515 +2: |
| 516 + subs r9, r9, #1 @ counter-- |
| 517 + add r0, r0, #4 @ next row |
| 518 +T itttt ne |
| 519 +A ldrne r3, [r0, -r1, lsl #1] @ p1 |
| 520 +T subne r3, r0, r1, lsl #1 |
| 521 +T ldrne r3, [r3] @ p1 |
| 522 +A ldrne r4, [r0, -r1] @ p0 |
| 523 +T subne r4, r0, r1 |
| 524 +T ldrne r4, [r4] @ p0 |
| 525 +T itt ne |
| 526 + ldrne r5, [r0] @ q0 |
| 527 + ldrne r6, [r0, r1] @ q1 |
| 528 + |
| 529 + bne 1b |
| 530 + |
| 531 + pop {r4 - r11, pc} |
| 532 +endfunc |
| 533 + |
| 534 +c0x01010101: .long 0x01010101 |
| 535 +c0x03030303: .long 0x03030303 |
| 536 +c0x04040404: .long 0x04040404 |
| 537 +c0x7F7F7F7F: .long 0x7F7F7F7F |
| 538 +c0x80808080: .long 0x80808080 |
| 539 + |
| 540 +@ void vp8_v_loop_filter16_inner(uint8_t *dst, int stride, |
| 541 +@ int fE, int fI, int hev_thresh) |
| 542 +@ and |
| 543 +@ void vp8_v_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride, |
| 544 +@ int fE, int fI, int hev_thresh) |
| 545 +@ call: |
| 546 +@ void vp8_v_loop_filter_inner(uint8_t *dst, int stride, |
| 547 +@ int fE, int fI, int hev_thresh, int count) |
| 548 +function ff_vp8_v_loop_filter_inner_armv6, export=1 |
| 549 + push {r4 - r11, lr} |
| 550 + |
| 551 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 552 + ldr r5, [sp, #40] @ counter |
| 553 + ldr r6, [sp, #36] @ load thresh address |
| 554 + sub sp, sp, #16 @ create temp buffer |
| 555 + |
| 556 + ldr r10,[r0, r1] @ p2 |
| 557 + ldr_post r9, r0, r1, lsl #1 @ p3 |
| 558 + ldr r12,[r0, r1] @ p0 |
| 559 + ldr_post r11, r0, r1, lsl #1 @ p1 |
| 560 + |
| 561 + orr r2, r2, r2, lsl #16 |
| 562 + orr r3, r3, r3, lsl #16 |
| 563 + orr r6, r6, r6, lsl #16 |
| 564 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte |
| 565 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte |
| 566 + orr r3, r6, r6, lsl #8 @ thresh splat int -> byte |
| 567 + |
| 568 +1: |
| 569 + @ vp8_filter_mask() function |
| 570 + @ calculate breakout conditions |
| 571 + uqsub8 r6, r9, r10 @ p3 - p2 |
| 572 + uqsub8 r7, r10, r9 @ p2 - p3 |
| 573 + uqsub8 r8, r10, r11 @ p2 - p1 |
| 574 + uqsub8 r10, r11, r10 @ p1 - p2 |
| 575 + |
| 576 + orr r6, r6, r7 @ abs (p3-p2) |
| 577 + orr r8, r8, r10 @ abs (p2-p1) |
| 578 + uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_fil
ter_mask |
| 579 + uqsub8 r8, r8, r2 @ compare to limit |
| 580 + uqsub8 r6, r11, r12 @ p1 - p0 |
| 581 + orr lr, lr, r8 |
| 582 + uqsub8 r7, r12, r11 @ p0 - p1 |
| 583 + ldr r10,[r0, r1] @ q1 |
| 584 + ldr_post r9, r0, r1, lsl #1 @ q0 |
| 585 + orr r6, r6, r7 @ abs (p1-p0) |
| 586 + uqsub8 r7, r6, r2 @ compare to limit |
| 587 + uqsub8 r8, r6, r3 @ compare to thresh -- save r8
for later |
| 588 + orr lr, lr, r7 |
| 589 + |
| 590 + uqsub8 r6, r11, r10 @ p1 - q1 |
| 591 + uqsub8 r7, r10, r11 @ q1 - p1 |
| 592 + uqsub8 r11, r12, r9 @ p0 - q0 |
| 593 + uqsub8 r12, r9, r12 @ q0 - p0 |
| 594 + orr r6, r6, r7 @ abs (p1-q1) |
| 595 + ldr r7, c0x7F7F7F7F |
| 596 + orr r12, r11, r12 @ abs (p0-q0) |
| 597 + ldr_post r11, r0, r1 @ q2 |
| 598 + uqadd8 r12, r12, r12 @ abs (p0-q0) * 2 |
| 599 + and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2 |
| 600 + uqsub8 r7, r9, r10 @ q0 - q1 |
| 601 + uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2 |
| 602 + uqsub8 r6, r10, r9 @ q1 - q0 |
| 603 + uqsub8 r12, r12, r4 @ compare to flimit |
| 604 + uqsub8 r9, r11, r10 @ q2 - q1 |
| 605 + |
| 606 + orr lr, lr, r12 |
| 607 + |
| 608 + ldr_post r12, r0, r1 @ q3 |
| 609 + uqsub8 r10, r10, r11 @ q1 - q2 |
| 610 + orr r6, r7, r6 @ abs (q1-q0) |
| 611 + orr r10, r9, r10 @ abs (q2-q1) |
| 612 + uqsub8 r7, r6, r2 @ compare to limit |
| 613 + uqsub8 r10, r10, r2 @ compare to limit |
| 614 + uqsub8 r6, r6, r3 @ compare to thresh -- save r6
for later |
| 615 + orr lr, lr, r7 |
| 616 + orr lr, lr, r10 |
| 617 + |
| 618 + uqsub8 r10, r12, r11 @ q3 - q2 |
| 619 + uqsub8 r9, r11, r12 @ q2 - q3 |
| 620 + |
| 621 + mvn r11, #0 @ r11 == -1 |
| 622 + |
| 623 + orr r10, r10, r9 @ abs (q3-q2) |
| 624 + uqsub8 r10, r10, r2 @ compare to limit |
| 625 + |
| 626 + mov r12, #0 |
| 627 + orr lr, lr, r10 |
| 628 + sub r0, r0, r1, lsl #2 |
| 629 + |
| 630 + usub8 lr, r12, lr @ use usub8 instead of ssub8 |
| 631 + sel lr, r11, r12 @ filter mask: lr |
| 632 + |
| 633 + cmp lr, #0 |
| 634 + beq 2f @ skip filtering |
| 635 + |
| 636 + sub r0, r0, r1, lsl #1 @ move r0 pointer down by 6 lin
es |
| 637 + |
| 638 + @vp8_hevmask() function |
| 639 + @calculate high edge variance |
| 640 + orr r10, r6, r8 @ calculate vp8_hevmask |
| 641 + |
| 642 + usub8 r10, r12, r10 @ use usub8 instead of ssub8 |
| 643 + sel r6, r12, r11 @ obtain vp8_hevmask: r6 |
| 644 + |
| 645 + @vp8_filter() function |
| 646 + ldr r8, [r0, r1] @ p0 |
| 647 + ldr_post r7, r0, r1, lsl #1 @ p1 |
| 648 + ldr r12, c0x80808080 |
| 649 + ldr r10,[r0, r1] @ q1 |
| 650 + ldr_post r9, r0, r1, lsl #1 @ q0 |
| 651 + |
| 652 + eor r7, r7, r12 @ p1 offset to convert to a sig
ned value |
| 653 + eor r8, r8, r12 @ p0 offset to convert to a sig
ned value |
| 654 + eor r9, r9, r12 @ q0 offset to convert to a sig
ned value |
| 655 + eor r10, r10, r12 @ q1 offset to convert to a sig
ned value |
| 656 + |
| 657 + str r9, [sp] @ store qs0 temporarily |
| 658 + str r8, [sp, #4] @ store ps0 temporarily |
| 659 + str r10,[sp, #8] @ store qs1 temporarily |
| 660 + str r7, [sp, #12] @ store ps1 temporarily |
| 661 + |
| 662 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1
) |
| 663 + qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_fil
ter + 3 * ( qs0 - ps0)) |
| 664 + |
| 665 + and r7, r7, r6 @ vp8_filter (r7) &= hev |
| 666 + |
| 667 + qadd8 r7, r7, r8 |
| 668 + ldr r9, c0x03030303 @ r9 = 3 --modified for vp8 |
| 669 + |
| 670 + qadd8 r7, r7, r8 |
| 671 + ldr r10, c0x04040404 |
| 672 + |
| 673 + qadd8 r7, r7, r8 |
| 674 + and r7, r7, lr @ vp8_filter &= mask@ |
| 675 + |
| 676 + qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_cha
r_clamp(vp8_filter+3) |
| 677 + qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_
clamp(vp8_filter+4) |
| 678 + |
| 679 + mov r9, #0 |
| 680 + shadd8 r8, r8, r9 @ Filter2 >>= 3 |
| 681 + shadd8 r7, r7, r9 @ vp8_filter >>= 3 |
| 682 + shadd8 r8, r8, r9 |
| 683 + shadd8 r7, r7, r9 |
| 684 + shadd8 lr, r8, r9 @ lr: Filter2 |
| 685 + shadd8 r7, r7, r9 @ r7: filter |
| 686 + |
| 687 + @calculate output |
| 688 + |
| 689 + ldr r8, [sp] @ load qs0 |
| 690 + ldr r9, [sp, #4] @ load ps0 |
| 691 + |
| 692 + ldr r10, c0x01010101 |
| 693 + |
| 694 + qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0
- vp8_filter) |
| 695 + qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0
+ Filter2) |
| 696 + |
| 697 + mov lr, #0 |
| 698 + sadd8 r7, r7, r10 @ vp8_filter += 1 |
| 699 + shadd8 r7, r7, lr @ vp8_filter >>= 1 |
| 700 + |
| 701 + ldr r11,[sp, #12] @ load ps1 |
| 702 + ldr r10,[sp, #8] @ load qs1 |
| 703 + |
| 704 + bic r7, r7, r6 @ vp8_filter &= ~hev |
| 705 + sub r0, r0, r1, lsl #2 |
| 706 + |
| 707 + qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1
+ vp8_filter) |
| 708 + qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1
- vp8_filter) |
| 709 + |
| 710 + eor r11, r11, r12 @ *op1 = u^0x80 |
| 711 + eor r9, r9, r12 @ *op0 = u^0x80 |
| 712 + eor r8, r8, r12 @ *oq0 = u^0x80 |
| 713 + eor r10, r10, r12 @ *oq1 = u^0x80 |
| 714 + str r9, [r0, r1] @ store op0 result |
| 715 + str_post r11, r0, r1, lsl #1 @ store op1 |
| 716 + str r10,[r0, r1] @ store oq1 |
| 717 + str_post r8, r0, r1, lsl #1 @ store oq0 result |
| 718 + |
| 719 + sub r0, r0, r1, lsl #1 |
| 720 + |
| 721 +2: |
| 722 + add r0, r0, #4 |
| 723 + sub r0, r0, r1, lsl #2 |
| 724 + |
| 725 + subs r5, r5, #1 |
| 726 +T ittt ne |
| 727 + ldrne r10,[r0, r1] @ p2 |
| 728 +A ldrne r9, [r0], r1, lsl #1 @ p3 |
| 729 +T ldrne r9, [r0] @ p3 |
| 730 +T addne r0, r0, r1, lsl #1 |
| 731 +T ittt ne |
| 732 + ldrne r12,[r0, r1] @ p0 |
| 733 +A ldrne r11,[r0], r1, lsl #1 @ p1 |
| 734 +T ldrne r11,[r0] @ p3 |
| 735 +T addne r0, r0, r1, lsl #1 |
| 736 + |
| 737 + bne 1b |
| 738 + |
| 739 + add sp, sp, #16 |
| 740 + pop {r4 - r11, pc} |
| 741 +endfunc |
| 742 + |
| 743 +@ void vp8_v_loop_filter16(uint8_t *dst, int stride, |
| 744 +@ int fE, int fI, int hev_thresh) |
| 745 +@ and |
| 746 +@ void vp8_v_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride, |
| 747 +@ int fE, int fI, int hev_thresh) |
| 748 +@ call: |
| 749 +@ void vp8_v_loop_filter(uint8_t *dst, int stride, |
| 750 +@ int fE, int fI, int hev_thresh, int count) |
| 751 +function ff_vp8_v_loop_filter_armv6, export=1 |
| 752 + push {r4 - r11, lr} |
| 753 + |
| 754 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 755 + ldr r5, [sp, #40] @ counter |
| 756 + ldr r6, [sp, #36] @ load thresh address |
| 757 + sub sp, sp, #16 @ create temp buffer |
| 758 + |
| 759 + ldr r10,[r0, r1] @ p2 |
| 760 + ldr_post r9, r0, r1, lsl #1 @ p3 |
| 761 + ldr r12,[r0, r1] @ p0 |
| 762 + ldr_post r11, r0, r1, lsl #1 @ p1 |
| 763 + |
| 764 + orr r2, r2, r2, lsl #16 |
| 765 + orr r3, r3, r3, lsl #16 |
| 766 + orr r6, r6, r6, lsl #16 |
| 767 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte |
| 768 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte |
| 769 + orr r3, r6, r6, lsl #8 @ thresh splat int -> byte |
| 770 + |
| 771 +1: |
| 772 + @ vp8_filter_mask() function |
| 773 + @ calculate breakout conditions |
| 774 + uqsub8 r6, r9, r10 @ p3 - p2 |
| 775 + uqsub8 r7, r10, r9 @ p2 - p3 |
| 776 + uqsub8 r8, r10, r11 @ p2 - p1 |
| 777 + uqsub8 r10, r11, r10 @ p1 - p2 |
| 778 + |
| 779 + orr r6, r6, r7 @ abs (p3-p2) |
| 780 + orr r8, r8, r10 @ abs (p2-p1) |
| 781 + uqsub8 lr, r6, r2 @ compare to limit. lr: vp8_fil
ter_mask |
| 782 + uqsub8 r8, r8, r2 @ compare to limit |
| 783 + |
| 784 + uqsub8 r6, r11, r12 @ p1 - p0 |
| 785 + orr lr, lr, r8 |
| 786 + uqsub8 r7, r12, r11 @ p0 - p1 |
| 787 + ldr r10,[r0, r1] @ q1 |
| 788 + ldr_post r9, r0, r1, lsl #1 @ q0 |
| 789 + orr r6, r6, r7 @ abs (p1-p0) |
| 790 + uqsub8 r7, r6, r2 @ compare to limit |
| 791 + uqsub8 r8, r6, r3 @ compare to thresh -- save r8
for later |
| 792 + orr lr, lr, r7 |
| 793 + |
| 794 + uqsub8 r6, r11, r10 @ p1 - q1 |
| 795 + uqsub8 r7, r10, r11 @ q1 - p1 |
| 796 + uqsub8 r11, r12, r9 @ p0 - q0 |
| 797 + uqsub8 r12, r9, r12 @ q0 - p0 |
| 798 + orr r6, r6, r7 @ abs (p1-q1) |
| 799 + ldr r7, c0x7F7F7F7F |
| 800 + orr r12, r11, r12 @ abs (p0-q0) |
| 801 + ldr_post r11, r0, r1 @ q2 |
| 802 + uqadd8 r12, r12, r12 @ abs (p0-q0) * 2 |
| 803 + and r6, r7, r6, lsr #1 @ abs (p1-q1) / 2 |
| 804 + uqsub8 r7, r9, r10 @ q0 - q1 |
| 805 + uqadd8 r12, r12, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2 |
| 806 + uqsub8 r6, r10, r9 @ q1 - q0 |
| 807 + uqsub8 r12, r12, r4 @ compare to flimit |
| 808 + uqsub8 r9, r11, r10 @ q2 - q1 |
| 809 + |
| 810 + orr lr, lr, r12 |
| 811 + |
| 812 + ldr_post r12, r0, r1 @ q3 |
| 813 + |
| 814 + uqsub8 r10, r10, r11 @ q1 - q2 |
| 815 + orr r6, r7, r6 @ abs (q1-q0) |
| 816 + orr r10, r9, r10 @ abs (q2-q1) |
| 817 + uqsub8 r7, r6, r2 @ compare to limit |
| 818 + uqsub8 r10, r10, r2 @ compare to limit |
| 819 + uqsub8 r6, r6, r3 @ compare to thresh -- save r6
for later |
| 820 + orr lr, lr, r7 |
| 821 + orr lr, lr, r10 |
| 822 + |
| 823 + uqsub8 r10, r12, r11 @ q3 - q2 |
| 824 + uqsub8 r9, r11, r12 @ q2 - q3 |
| 825 + |
| 826 + mvn r11, #0 @ r11 == -1 |
| 827 + |
| 828 + orr r10, r10, r9 @ abs (q3-q2) |
| 829 + uqsub8 r10, r10, r2 @ compare to limit |
| 830 + |
| 831 + mov r12, #0 |
| 832 + |
| 833 + orr lr, lr, r10 |
| 834 + |
| 835 + usub8 lr, r12, lr @ use usub8 instead of ssub8 |
| 836 + sel lr, r11, r12 @ filter mask: lr |
| 837 + |
| 838 + cmp lr, #0 |
| 839 + beq 2f @ skip filtering |
| 840 + |
| 841 + @vp8_hevmask() function |
| 842 + @calculate high edge variance |
| 843 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 6 lin
es |
| 844 + sub r0, r0, r1, lsl #1 |
| 845 + |
| 846 + orr r10, r6, r8 |
| 847 + |
| 848 + usub8 r10, r12, r10 |
| 849 + sel r6, r12, r11 @ hev mask: r6 |
| 850 + |
| 851 + @vp8_mbfilter() function |
| 852 + @p2, q2 are only needed at the end. Do not need to load them in now. |
| 853 + ldr r8, [r0, r1] @ p0 |
| 854 + ldr_post r7, r0, r1, lsl #1 @ p1 |
| 855 + ldr r12, c0x80808080 |
| 856 + ldr_post r9, r0, r1 @ q0 |
| 857 + ldr r10,[r0] @ q1 |
| 858 + |
| 859 + eor r7, r7, r12 @ ps1 |
| 860 + eor r8, r8, r12 @ ps0 |
| 861 + eor r9, r9, r12 @ qs0 |
| 862 + eor r10, r10, r12 @ qs1 |
| 863 + |
| 864 + qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_fil
ter + 3 * ( qs0 - ps0)) |
| 865 + str r7, [sp, #12] @ store ps1 temporarily |
| 866 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1
) |
| 867 + str r10,[sp, #8] @ store qs1 temporarily |
| 868 + qadd8 r7, r7, r12 |
| 869 + str r9, [sp] @ store qs0 temporarily |
| 870 + qadd8 r7, r7, r12 |
| 871 + str r8, [sp, #4] @ store ps0 temporarily |
| 872 + qadd8 r7, r7, r12 @ vp8_filter: r7 |
| 873 + |
| 874 + ldr r10, c0x03030303 @ r10 = 3 --modified for vp8 |
| 875 + ldr r9, c0x04040404 |
| 876 + |
| 877 + and r7, r7, lr @ vp8_filter &= mask (lr is fre
e) |
| 878 + |
| 879 + mov r12, r7 @ Filter2: r12 |
| 880 + and r12, r12, r6 @ Filter2 &= hev |
| 881 + |
| 882 + @save bottom 3 bits so that we round one side +4 and the other +3 |
| 883 + qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_cha
r_clamp(Filter2+4) |
| 884 + qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_ch
ar_clamp(Filter2+3) |
| 885 + |
| 886 + mov r10, #0 |
| 887 + shadd8 r8, r8, r10 @ Filter1 >>= 3 |
| 888 + shadd8 r12, r12, r10 @ Filter2 >>= 3 |
| 889 + shadd8 r8, r8, r10 |
| 890 + shadd8 r12, r12, r10 |
| 891 + shadd8 r8, r8, r10 @ r8: Filter1 |
| 892 + shadd8 r12, r12, r10 @ r12: Filter2 |
| 893 + |
| 894 + ldr r9, [sp] @ load qs0 |
| 895 + ldr r11,[sp, #4] @ load ps0 |
| 896 + |
| 897 + qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(q
s0 - Filter1) |
| 898 + qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(p
s0 + Filter2) |
| 899 + |
| 900 + bic r12, r7, r6 @ vp8_filter &= ~hev ( r6 is
free) |
| 901 + |
| 902 + @roughly 3/7th difference across boundary |
| 903 + mov lr, #0x1b @ 27 |
| 904 + mov r7, #0x3f @ 63 |
| 905 + |
| 906 + sxtb16 r6, r12 |
| 907 + sxtb16 r10, r12, ror #8 |
| 908 + smlabb r8, r6, lr, r7 |
| 909 + smlatb r6, r6, lr, r7 |
| 910 + smlabb r7, r10, lr, r7 |
| 911 + smultb r10, r10, lr |
| 912 + ssat r8, #8, r8, asr #7 |
| 913 + ssat r6, #8, r6, asr #7 |
| 914 + add r10, r10, #63 |
| 915 + ssat r7, #8, r7, asr #7 |
| 916 + ssat r10, #8, r10, asr #7 |
| 917 + |
| 918 + ldr lr, c0x80808080 |
| 919 + |
| 920 + pkhbt r6, r8, r6, lsl #16 |
| 921 + pkhbt r10, r7, r10, lsl #16 |
| 922 + uxtb16 r6, r6 |
| 923 + uxtb16 r10, r10 |
| 924 + |
| 925 + sub r0, r0, r1 |
| 926 + |
| 927 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 27)>>7) |
| 928 + |
| 929 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0
- u) |
| 930 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0
+ u) |
| 931 + eor r8, r8, lr @ *oq0 = s^0x80 |
| 932 + str r8, [r0] @ store *oq0 |
| 933 + sub r0, r0, r1 |
| 934 + eor r10, r10, lr @ *op0 = s^0x80 |
| 935 + str r10,[r0] @ store *op0 |
| 936 + |
| 937 + @roughly 2/7th difference across boundary |
| 938 + mov lr, #0x12 @ 18 |
| 939 + mov r7, #0x3f @ 63 |
| 940 + |
| 941 + sxtb16 r6, r12 |
| 942 + sxtb16 r10, r12, ror #8 |
| 943 + smlabb r8, r6, lr, r7 |
| 944 + smlatb r6, r6, lr, r7 |
| 945 + smlabb r9, r10, lr, r7 |
| 946 + smlatb r10, r10, lr, r7 |
| 947 + ssat r8, #8, r8, asr #7 |
| 948 + ssat r6, #8, r6, asr #7 |
| 949 + ssat r9, #8, r9, asr #7 |
| 950 + ssat r10, #8, r10, asr #7 |
| 951 + |
| 952 + ldr lr, c0x80808080 |
| 953 + |
| 954 + pkhbt r6, r8, r6, lsl #16 |
| 955 + pkhbt r10, r9, r10, lsl #16 |
| 956 + |
| 957 + ldr r9, [sp, #8] @ load qs1 |
| 958 + ldr r11, [sp, #12] @ load ps1 |
| 959 + |
| 960 + uxtb16 r6, r6 |
| 961 + uxtb16 r10, r10 |
| 962 + |
| 963 + sub r0, r0, r1 |
| 964 + |
| 965 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 18)>>7) |
| 966 + |
| 967 + qadd8 r11, r11, r10 @ s = vp8_signed_char_clamp(ps1
+ u) |
| 968 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1
- u) |
| 969 + eor r11, r11, lr @ *op1 = s^0x80 |
| 970 + str_post r11, r0, r1 @ store *op1 |
| 971 + eor r8, r8, lr @ *oq1 = s^0x80 |
| 972 + add r0, r0, r1, lsl #1 |
| 973 + |
| 974 + mov r7, #0x3f @ 63 |
| 975 + |
| 976 + str_post r8, r0, r1 @ store *oq1 |
| 977 + |
| 978 + @roughly 1/7th difference across boundary |
| 979 + mov lr, #0x9 @ 9 |
| 980 + ldr r9, [r0] @ load q2 |
| 981 + |
| 982 + sxtb16 r6, r12 |
| 983 + sxtb16 r10, r12, ror #8 |
| 984 + smlabb r8, r6, lr, r7 |
| 985 + smlatb r6, r6, lr, r7 |
| 986 + smlabb r12, r10, lr, r7 |
| 987 + smlatb r10, r10, lr, r7 |
| 988 + ssat r8, #8, r8, asr #7 |
| 989 + ssat r6, #8, r6, asr #7 |
| 990 + ssat r12, #8, r12, asr #7 |
| 991 + ssat r10, #8, r10, asr #7 |
| 992 + |
| 993 + sub r0, r0, r1, lsl #2 |
| 994 + |
| 995 + pkhbt r6, r8, r6, lsl #16 |
| 996 + pkhbt r10, r12, r10, lsl #16 |
| 997 + |
| 998 + sub r0, r0, r1 |
| 999 + ldr lr, c0x80808080 |
| 1000 + |
| 1001 + ldr r11, [r0] @ load p2 |
| 1002 + |
| 1003 + uxtb16 r6, r6 |
| 1004 + uxtb16 r10, r10 |
| 1005 + |
| 1006 + eor r9, r9, lr |
| 1007 + eor r11, r11, lr |
| 1008 + |
| 1009 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 9)>>7) |
| 1010 + |
| 1011 + qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2
+ u) |
| 1012 + qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2
- u) |
| 1013 + eor r8, r8, lr @ *op2 = s^0x80 |
| 1014 + str_post r8, r0, r1, lsl #2 @ store *op2 |
| 1015 + add r0, r0, r1 |
| 1016 + eor r10, r10, lr @ *oq2 = s^0x80 |
| 1017 + str_post r10, r0, r1, lsl #1 @ store *oq2 |
| 1018 + |
| 1019 +2: |
| 1020 + add r0, r0, #4 |
| 1021 + sub r0, r0, r1, lsl #3 |
| 1022 + subs r5, r5, #1 |
| 1023 + |
| 1024 +T ittt ne |
| 1025 + ldrne r10,[r0, r1] @ p2 |
| 1026 +A ldrne r9, [r0], r1, lsl #1 @ p3 |
| 1027 +T ldrne r9, [r0] @ p3 |
| 1028 +T addne r0, r0, r1, lsl #1 |
| 1029 +T ittt ne |
| 1030 + ldrne r12,[r0, r1] @ p0 |
| 1031 +A ldrne r11,[r0], r1, lsl #1 @ p1 |
| 1032 +T ldrne r11,[r0] @ p1 |
| 1033 +T addne r0, r0, r1, lsl #1 |
| 1034 + |
| 1035 + bne 1b |
| 1036 + |
| 1037 + add sp, sp, #16 |
| 1038 + pop {r4 - r11, pc} |
| 1039 +endfunc |
| 1040 + |
| 1041 +.macro TRANSPOSE_MATRIX i0, i1, i2, i3, o3, o2, o1, o0 |
| 1042 + @ input: $0, $1, $2, $3 |
| 1043 + @ output: $4, $5, $6, $7 |
| 1044 + @ i0: 03 02 01 00 |
| 1045 + @ i1: 13 12 11 10 |
| 1046 + @ i2: 23 22 21 20 |
| 1047 + @ i3: 33 32 31 30 |
| 1048 + @ o3 o2 o1 o0 |
| 1049 + |
| 1050 + uxtb16 \o1, \i1 @ xx 12 xx 10 |
| 1051 + uxtb16 \o0, \i0 @ xx 02 xx 00 |
| 1052 + uxtb16 \o3, \i3 @ xx 32 xx 30 |
| 1053 + uxtb16 \o2, \i2 @ xx 22 xx 20 |
| 1054 + orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00 |
| 1055 + orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20 |
| 1056 + |
| 1057 + uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11 |
| 1058 + uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31 |
| 1059 + uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01 |
| 1060 + uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21 |
| 1061 + orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01 |
| 1062 + orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21 |
| 1063 + |
| 1064 + pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 -- p1 |
| 1065 + pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 -- p3 |
| 1066 + |
| 1067 + pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 -- p0 |
| 1068 + pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 -- p2 |
| 1069 +.endm |
| 1070 + |
| 1071 +@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim) |
| 1072 +function ff_vp8_h_loop_filter16_simple_armv6, export=1 |
| 1073 + push {r4 - r11, lr} |
| 1074 + orr r12, r2, r2, lsl #16 |
| 1075 + ldr r2, c0x80808080 |
| 1076 + orr r12, r12, r12, lsl #8 |
| 1077 + |
| 1078 + @ load source data to r7, r8, r9, r10 |
| 1079 + sub r0, r0, #2 |
| 1080 + ldr r8, [r0, r1] |
| 1081 + ldr_post r7, r0, r1, lsl #1 |
| 1082 + ldr r10,[r0, r1] |
| 1083 + ldr_post r9, r0, r1, lsl #1 |
| 1084 + add r0, r0, #2 |
| 1085 + |
| 1086 + mov r11, #4 @ count (r11) for 4-in-parallel |
| 1087 +1: |
| 1088 + @transpose r7, r8, r9, r10 to r3, r4, r5, r6 |
| 1089 + TRANSPOSE_MATRIX r7, r8, r9, r10, r6, r5, r4, r3 |
| 1090 + |
| 1091 + @ vp8_simple_filter_mask() function |
| 1092 + uqsub8 r7, r3, r6 @ p1 - q1 |
| 1093 + uqsub8 r8, r6, r3 @ q1 - p1 |
| 1094 + uqsub8 r9, r4, r5 @ p0 - q0 |
| 1095 + uqsub8 r10, r5, r4 @ q0 - p0 |
| 1096 + orr r7, r7, r8 @ abs(p1 - q1) |
| 1097 + orr r9, r9, r10 @ abs(p0 - q0) |
| 1098 + mov r8, #0 |
| 1099 + uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2 |
| 1100 + uhadd8 r7, r7, r8 @ abs(p1 - q1) / 2 |
| 1101 + uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1 - q1)
/2 |
| 1102 + mvn r10, #0 @ r10 == -1 |
| 1103 + |
| 1104 + usub8 r7, r12, r7 @ compare to flimit |
| 1105 + sel lr, r10, r8 @ filter mask |
| 1106 + |
| 1107 + cmp lr, #0 |
| 1108 + beq 2f @ skip filtering |
| 1109 + |
| 1110 + @vp8_simple_filter() function |
| 1111 + eor r3, r3, r2 @ p1 offset to convert to a sig
ned value |
| 1112 + eor r6, r6, r2 @ q1 offset to convert to a sig
ned value |
| 1113 + eor r4, r4, r2 @ p0 offset to convert to a sig
ned value |
| 1114 + eor r5, r5, r2 @ q0 offset to convert to a sig
ned value |
| 1115 + |
| 1116 + qsub8 r3, r3, r6 @ vp8_filter = p1 - q1 |
| 1117 + qsub8 r6, r5, r4 @ q0 - p0 |
| 1118 + |
| 1119 + qadd8 r3, r3, r6 @ vp8_filter += q0 - p0 |
| 1120 + ldr r9, c0x03030303 @ r9 = 3 |
| 1121 + |
| 1122 + qadd8 r3, r3, r6 @ vp8_filter += q0 - p0 |
| 1123 + ldr r7, c0x04040404 |
| 1124 + |
| 1125 + qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0
)) |
| 1126 + @STALL |
| 1127 + and r3, r3, lr @ vp8_filter &= mask |
| 1128 + |
| 1129 + qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3 |
| 1130 + qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4 |
| 1131 + |
| 1132 + shadd8 r9, r9, r8 |
| 1133 + shadd8 r3, r3, r8 |
| 1134 + shadd8 r9, r9, r8 |
| 1135 + shadd8 r3, r3, r8 |
| 1136 + shadd8 r9, r9, r8 @ Filter2 >>= 3 |
| 1137 + shadd8 r3, r3, r8 @ Filter1 >>= 3 |
| 1138 + |
| 1139 + @calculate output |
| 1140 + sub r0, r0, r1, lsl #2 |
| 1141 + |
| 1142 + qadd8 r4, r4, r9 @ u = p0 + Filter2 |
| 1143 + qsub8 r5, r5, r3 @ u = q0 - Filter1 |
| 1144 + eor r4, r4, r2 @ *op0 = u^0x80 |
| 1145 + eor r5, r5, r2 @ *oq0 = u^0x80 |
| 1146 + |
| 1147 + strb r4, [r0, #-1] @ store the result |
| 1148 + mov r4, r4, lsr #8 |
| 1149 + strb_post r5, r0, r1 |
| 1150 + mov r5, r5, lsr #8 |
| 1151 + |
| 1152 + strb r4, [r0, #-1] |
| 1153 + mov r4, r4, lsr #8 |
| 1154 + strb_post r5, r0, r1 |
| 1155 + mov r5, r5, lsr #8 |
| 1156 + |
| 1157 + strb r4, [r0, #-1] |
| 1158 + mov r4, r4, lsr #8 |
| 1159 + strb_post r5, r0, r1 |
| 1160 + mov r5, r5, lsr #8 |
| 1161 + |
| 1162 + strb r4, [r0, #-1] |
| 1163 + strb_post r5, r0, r1 |
| 1164 + |
| 1165 +2: |
| 1166 + subs r11, r11, #1 |
| 1167 + |
| 1168 + @ load source data to r7, r8, r9, r10 |
| 1169 + sub r0, r0, #2 |
| 1170 +T ittt ne |
| 1171 + ldrne r8, [r0, r1] |
| 1172 +A ldrne r7, [r0], r1, lsl #1 |
| 1173 +T ldrne r7, [r0] |
| 1174 +T addne r0, r0, r1, lsl #1 |
| 1175 +T ittt ne |
| 1176 + ldrne r10,[r0, r1] |
| 1177 +A ldrne r9, [r0], r1, lsl #1 |
| 1178 +T ldrne r9, [r0] |
| 1179 +T addne r0, r0, r1, lsl #1 |
| 1180 + add r0, r0, #2 |
| 1181 + |
| 1182 + bne 1b |
| 1183 + |
| 1184 + pop {r4 - r11, pc} |
| 1185 +endfunc |
| 1186 + |
| 1187 +@ void vp8_h_loop_filter16_inner(uint8_t *dst, int stride, |
| 1188 +@ int fE, int fI, int hev_thresh) |
| 1189 +@ and |
| 1190 +@ void vp8_h_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride, |
| 1191 +@ int fE, int fI, int hev_thresh) |
| 1192 +@ call: |
| 1193 +@ void vp8_h_loop_filter_inner(uint8_t *dst, int stride, |
| 1194 +@ int fE, int fI, int hev_thresh, int count) |
| 1195 +function ff_vp8_h_loop_filter_inner_armv6, export=1 |
| 1196 + push {r4 - r11, lr} |
| 1197 + |
| 1198 + sub r0, r0, #4 @ move r0 pointer down by 4 |
| 1199 + ldr r5, [sp, #40] @ counter |
| 1200 + ldr r9, [sp, #36] @ load thresh address |
| 1201 + sub sp, sp, #16 @ create temp buffer |
| 1202 + |
| 1203 + ldr r7, [r0, r1] @ transpose will make it into p
3-p0 |
| 1204 + ldr_post r6, r0, r1, lsl #1 @ load source data |
| 1205 + ldr lr, [r0, r1] |
| 1206 + ldr_post r8, r0, r1, lsl #1 |
| 1207 + |
| 1208 + orr r2, r2, r2, lsl #16 |
| 1209 + orr r3, r3, r3, lsl #16 |
| 1210 + orr r9, r9, r9, lsl #16 |
| 1211 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte |
| 1212 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte |
| 1213 + orr r3, r9, r9, lsl #8 @ thresh splat int -> byte |
| 1214 + |
| 1215 +1: |
| 1216 + @ vp8_filter_mask() function |
| 1217 + @ calculate breakout conditions |
| 1218 + @ transpose the source data for 4-in-parallel operation |
| 1219 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9 |
| 1220 + |
| 1221 + uqsub8 r7, r9, r10 @ p3 - p2 |
| 1222 + uqsub8 r8, r10, r9 @ p2 - p3 |
| 1223 + uqsub8 r9, r10, r11 @ p2 - p1 |
| 1224 + uqsub8 r10, r11, r10 @ p1 - p2 |
| 1225 + orr r7, r7, r8 @ abs (p3-p2) |
| 1226 + orr r10, r9, r10 @ abs (p2-p1) |
| 1227 + uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_fil
ter_mask |
| 1228 + uqsub8 r10, r10, r2 @ compare to limit |
| 1229 + |
| 1230 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1231 + |
| 1232 + orr lr, lr, r10 |
| 1233 + |
| 1234 + uqsub8 r6, r11, r12 @ p1 - p0 |
| 1235 + uqsub8 r7, r12, r11 @ p0 - p1 |
| 1236 + add r0, r0, #4 @ move r0 pointer up by 4 |
| 1237 + orr r6, r6, r7 @ abs (p1-p0) |
| 1238 + str r11,[sp, #12] @ save p1 |
| 1239 + uqsub8 r10, r6, r2 @ compare to limit |
| 1240 + uqsub8 r11, r6, r3 @ compare to thresh |
| 1241 + orr lr, lr, r10 |
| 1242 + |
| 1243 + @ transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now |
| 1244 + @ transpose the source data for 4-in-parallel operation |
| 1245 + str r11,[sp] @ push r11 to stack |
| 1246 + ldr r7, [r0, r1] |
| 1247 + ldr_post r6, r0, r1, lsl #1 @ load source data |
| 1248 + str r12,[sp, #4] @ save current reg before load
q0 - q3 data |
| 1249 + str lr, [sp, #8] |
| 1250 + ldr lr, [r0, r1] |
| 1251 + ldr_post r8, r0, r1, lsl #1 |
| 1252 + |
| 1253 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9 |
| 1254 + |
| 1255 + ldr lr, [sp, #8] @ load back (f)limit accumulato
r |
| 1256 + |
| 1257 + uqsub8 r6, r12, r11 @ q3 - q2 |
| 1258 + uqsub8 r7, r11, r12 @ q2 - q3 |
| 1259 + uqsub8 r12, r11, r10 @ q2 - q1 |
| 1260 + uqsub8 r11, r10, r11 @ q1 - q2 |
| 1261 + orr r6, r6, r7 @ abs (q3-q2) |
| 1262 + orr r7, r12, r11 @ abs (q2-q1) |
| 1263 + uqsub8 r6, r6, r2 @ compare to limit |
| 1264 + uqsub8 r7, r7, r2 @ compare to limit |
| 1265 + ldr r11,[sp, #4] @ load back p0 |
| 1266 + ldr r12,[sp, #12] @ load back p1 |
| 1267 + orr lr, lr, r6 |
| 1268 + orr lr, lr, r7 |
| 1269 + |
| 1270 + uqsub8 r6, r11, r9 @ p0 - q0 |
| 1271 + uqsub8 r7, r9, r11 @ q0 - p0 |
| 1272 + uqsub8 r8, r12, r10 @ p1 - q1 |
| 1273 + uqsub8 r11, r10, r12 @ q1 - p1 |
| 1274 + orr r6, r6, r7 @ abs (p0-q0) |
| 1275 + ldr r7, c0x7F7F7F7F |
| 1276 + orr r8, r8, r11 @ abs (p1-q1) |
| 1277 + uqadd8 r6, r6, r6 @ abs (p0-q0) * 2 |
| 1278 + and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2 |
| 1279 + uqsub8 r11, r10, r9 @ q1 - q0 |
| 1280 + uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2 |
| 1281 + uqsub8 r12, r9, r10 @ q0 - q1 |
| 1282 + uqsub8 r6, r6, r4 @ compare to flimit |
| 1283 + |
| 1284 + orr r9, r11, r12 @ abs (q1-q0) |
| 1285 + uqsub8 r8, r9, r2 @ compare to limit |
| 1286 + uqsub8 r10, r9, r3 @ compare to thresh |
| 1287 + orr lr, lr, r6 |
| 1288 + orr lr, lr, r8 |
| 1289 + |
| 1290 + mvn r11, #0 @ r11 == -1 |
| 1291 + mov r12, #0 |
| 1292 + |
| 1293 + usub8 lr, r12, lr |
| 1294 + ldr r9, [sp] @ load the compared result |
| 1295 + sel lr, r11, r12 @ filter mask: lr |
| 1296 + |
| 1297 + cmp lr, #0 |
| 1298 + beq 2f @ skip filtering |
| 1299 + |
| 1300 + @vp8_hevmask() function |
| 1301 + @calculate high edge variance |
| 1302 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1303 + |
| 1304 + orr r9, r9, r10 |
| 1305 + |
| 1306 + ldrh r7, [r0, #-2] |
| 1307 + ldrh_post r8, r0, r1 |
| 1308 + |
| 1309 + usub8 r9, r12, r9 |
| 1310 + sel r6, r12, r11 @ hev mask: r6 |
| 1311 + |
| 1312 + @vp8_filter() function |
| 1313 + @ load source data to r6, r11, r12, lr |
| 1314 + ldrh r9, [r0, #-2] |
| 1315 + ldrh_post r10, r0, r1 |
| 1316 + |
| 1317 + pkhbt r12, r7, r8, lsl #16 |
| 1318 + |
| 1319 + ldrh r7, [r0, #-2] |
| 1320 + ldrh_post r8, r0, r1 |
| 1321 + |
| 1322 + pkhbt r11, r9, r10, lsl #16 |
| 1323 + |
| 1324 + ldrh r9, [r0, #-2] |
| 1325 + ldrh_post r10, r0, r1 |
| 1326 + |
| 1327 + @ Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first |
| 1328 + str r6, [sp] |
| 1329 + str lr, [sp, #4] |
| 1330 + |
| 1331 + pkhbt r6, r7, r8, lsl #16 |
| 1332 + pkhbt lr, r9, r10, lsl #16 |
| 1333 + |
| 1334 + @transpose r12, r11, r6, lr to r7, r8, r9, r10 |
| 1335 + TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7 |
| 1336 + |
| 1337 + @load back hev_mask r6 and filter_mask lr |
| 1338 + ldr r12, c0x80808080 |
| 1339 + ldr r6, [sp] |
| 1340 + ldr lr, [sp, #4] |
| 1341 + |
| 1342 + eor r7, r7, r12 @ p1 offset to convert to a sig
ned value |
| 1343 + eor r8, r8, r12 @ p0 offset to convert to a sig
ned value |
| 1344 + eor r9, r9, r12 @ q0 offset to convert to a sig
ned value |
| 1345 + eor r10, r10, r12 @ q1 offset to convert to a sig
ned value |
| 1346 + |
| 1347 + str r9, [sp] @ store qs0 temporarily |
| 1348 + str r8, [sp, #4] @ store ps0 temporarily |
| 1349 + str r10,[sp, #8] @ store qs1 temporarily |
| 1350 + str r7, [sp, #12] @ store ps1 temporarily |
| 1351 + |
| 1352 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1
) |
| 1353 + qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_fil
ter + 3 * ( qs0 - ps0)) |
| 1354 + |
| 1355 + and r7, r7, r6 @ vp8_filter (r7) &= hev (r7 :
filter) |
| 1356 + |
| 1357 + qadd8 r7, r7, r8 |
| 1358 + ldr r9, c0x03030303 @ r9 = 3 --modified for vp8 |
| 1359 + |
| 1360 + qadd8 r7, r7, r8 |
| 1361 + ldr r10, c0x04040404 |
| 1362 + |
| 1363 + qadd8 r7, r7, r8 |
| 1364 + |
| 1365 + and r7, r7, lr @ vp8_filter &= mask |
| 1366 + |
| 1367 + qadd8 r8, r7, r9 @ Filter2 (r8) = vp8_signed_cha
r_clamp(vp8_filter+3) |
| 1368 + qadd8 r7, r7, r10 @ vp8_filter = vp8_signed_char_
clamp(vp8_filter+4) |
| 1369 + |
| 1370 + mov r9, #0 |
| 1371 + shadd8 r8, r8, r9 @ Filter2 >>= 3 |
| 1372 + shadd8 r7, r7, r9 @ vp8_filter >>= 3 |
| 1373 + shadd8 r8, r8, r9 |
| 1374 + shadd8 r7, r7, r9 |
| 1375 + shadd8 lr, r8, r9 @ lr: filter2 |
| 1376 + shadd8 r7, r7, r9 @ r7: filter |
| 1377 + |
| 1378 + @calculate output |
| 1379 + ldr r8, [sp] @ load qs0 |
| 1380 + ldr r9, [sp, #4] @ load ps0 |
| 1381 + |
| 1382 + ldr r10, c0x01010101 |
| 1383 + |
| 1384 + qsub8 r8, r8, r7 @ u = vp8_signed_char_clamp(qs0
- vp8_filter) |
| 1385 + qadd8 r9, r9, lr @ u = vp8_signed_char_clamp(ps0
+ Filter2) |
| 1386 + |
| 1387 + eor r8, r8, r12 |
| 1388 + eor r9, r9, r12 |
| 1389 + |
| 1390 + mov lr, #0 |
| 1391 + |
| 1392 + sadd8 r7, r7, r10 |
| 1393 + shadd8 r7, r7, lr |
| 1394 + |
| 1395 + ldr r10,[sp, #8] @ load qs1 |
| 1396 + ldr r11,[sp, #12] @ load ps1 |
| 1397 + |
| 1398 + bic r7, r7, r6 @ r7: vp8_filter |
| 1399 + |
| 1400 + qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1
- vp8_filter) |
| 1401 + qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1
+ vp8_filter) |
| 1402 + eor r10, r10, r12 |
| 1403 + eor r11, r11, r12 |
| 1404 + |
| 1405 + sub r0, r0, r1, lsl #2 |
| 1406 + |
| 1407 + @we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0,
p0, p1 |
| 1408 + TRANSPOSE_MATRIX r11, r9, r8, r10, lr, r12, r7, r6 |
| 1409 + |
| 1410 + strh r6, [r0, #-2] @ store the result |
| 1411 + mov r6, r6, lsr #16 |
| 1412 + strh_post r6, r0, r1 |
| 1413 + |
| 1414 + strh r7, [r0, #-2] |
| 1415 + mov r7, r7, lsr #16 |
| 1416 + strh_post r7, r0, r1 |
| 1417 + |
| 1418 + strh r12, [r0, #-2] |
| 1419 + mov r12, r12, lsr #16 |
| 1420 + strh_post r12, r0, r1 |
| 1421 + |
| 1422 + strh lr, [r0, #-2] |
| 1423 + mov lr, lr, lsr #16 |
| 1424 + strh_post lr, r0, r1 |
| 1425 + |
| 1426 +2: |
| 1427 + sub r0, r0, #4 |
| 1428 + subs r5, r5, #1 |
| 1429 + |
| 1430 +T ittt ne |
| 1431 + ldrne r7, [r0, r1] |
| 1432 +A ldrne r6, [r0], r1, lsl #1 @ load source data |
| 1433 +T ldrne r6, [r0] @ load source data |
| 1434 +T addne r0, r0, r1, lsl #1 |
| 1435 +T ittt ne |
| 1436 + ldrne lr, [r0, r1] |
| 1437 +A ldrne r8, [r0], r1, lsl #1 |
| 1438 +T ldrne r8, [r0] |
| 1439 +T addne r0, r0, r1, lsl #1 |
| 1440 + |
| 1441 + bne 1b |
| 1442 + |
| 1443 + add sp, sp, #16 |
| 1444 + pop {r4 - r11, pc} |
| 1445 +endfunc |
| 1446 + |
| 1447 +@ void vp8_h_loop_filter16(uint8_t *dst, int stride, |
| 1448 +@ int fE, int fI, int hev_thresh) |
| 1449 +@ and |
| 1450 +@ void vp8_h_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride, |
| 1451 +@ int fE, int fI, int hev_thresh) |
| 1452 +@ call: |
| 1453 +@ void vp8_h_loop_filter(uint8_t *dst, int stride, |
| 1454 +@ int fE, int fI, int hev_thresh, int count) |
| 1455 +function ff_vp8_h_loop_filter_armv6, export=1 |
| 1456 + push {r4 - r11, lr} |
| 1457 + |
| 1458 + sub r0, r0, #4 @ move r0 pointer down by 4 |
| 1459 + ldr r5, [sp, #40] @ counter |
| 1460 + ldr r9, [sp, #36] @ load thresh address |
| 1461 + sub sp, sp, #16 @ create temp buffer |
| 1462 + |
| 1463 + ldr r7, [r0, r1] @ transpose will make it into p
3-p0 |
| 1464 + ldr_post r6, r0, r1, lsl #1 @ load source data |
| 1465 + ldr lr, [r0, r1] |
| 1466 + ldr_post r8, r0, r1, lsl #1 |
| 1467 + |
| 1468 + orr r2, r2, r2, lsl #16 |
| 1469 + orr r3, r3, r3, lsl #16 |
| 1470 + orr r9, r9, r9, lsl #16 |
| 1471 + orr r4, r2, r2, lsl #8 @ flimE splat int -> byte |
| 1472 + orr r2, r3, r3, lsl #8 @ flimI splat int -> byte |
| 1473 + orr r3, r9, r9, lsl #8 @ thresh splat int -> byte |
| 1474 + |
| 1475 +1: |
| 1476 + @ vp8_filter_mask() function |
| 1477 + @ calculate breakout conditions |
| 1478 + @ transpose the source data for 4-in-parallel operation |
| 1479 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9 |
| 1480 + |
| 1481 + uqsub8 r7, r9, r10 @ p3 - p2 |
| 1482 + uqsub8 r8, r10, r9 @ p2 - p3 |
| 1483 + uqsub8 r9, r10, r11 @ p2 - p1 |
| 1484 + uqsub8 r10, r11, r10 @ p1 - p2 |
| 1485 + orr r7, r7, r8 @ abs (p3-p2) |
| 1486 + orr r10, r9, r10 @ abs (p2-p1) |
| 1487 + uqsub8 lr, r7, r2 @ compare to limit. lr: vp8_fil
ter_mask |
| 1488 + uqsub8 r10, r10, r2 @ compare to limit |
| 1489 + |
| 1490 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1491 + |
| 1492 + orr lr, lr, r10 |
| 1493 + |
| 1494 + uqsub8 r6, r11, r12 @ p1 - p0 |
| 1495 + uqsub8 r7, r12, r11 @ p0 - p1 |
| 1496 + add r0, r0, #4 @ move r0 pointer up by 4 |
| 1497 + orr r6, r6, r7 @ abs (p1-p0) |
| 1498 + str r11,[sp, #12] @ save p1 |
| 1499 + uqsub8 r10, r6, r2 @ compare to limit |
| 1500 + uqsub8 r11, r6, r3 @ compare to thresh |
| 1501 + orr lr, lr, r10 |
| 1502 + |
| 1503 + @ transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now |
| 1504 + @ transpose the source data for 4-in-parallel operation |
| 1505 + str r11,[sp] @ push r11 to stack |
| 1506 + ldr r7, [r0, r1] |
| 1507 + ldr_post r6, r0, r1, lsl #1 @ load source data |
| 1508 + str r12,[sp, #4] @ save current reg before load
q0 - q3 data |
| 1509 + str lr, [sp, #8] |
| 1510 + ldr lr, [r0, r1] |
| 1511 + ldr_post r8, r0, r1, lsl #1 |
| 1512 + |
| 1513 + TRANSPOSE_MATRIX r6, r7, r8, lr, r12, r11, r10, r9 |
| 1514 + |
| 1515 + ldr lr, [sp, #8] @ load back (f)limit accumulato
r |
| 1516 + |
| 1517 + uqsub8 r6, r12, r11 @ q3 - q2 |
| 1518 + uqsub8 r7, r11, r12 @ q2 - q3 |
| 1519 + uqsub8 r12, r11, r10 @ q2 - q1 |
| 1520 + uqsub8 r11, r10, r11 @ q1 - q2 |
| 1521 + orr r6, r6, r7 @ abs (q3-q2) |
| 1522 + orr r7, r12, r11 @ abs (q2-q1) |
| 1523 + uqsub8 r6, r6, r2 @ compare to limit |
| 1524 + uqsub8 r7, r7, r2 @ compare to limit |
| 1525 + ldr r11,[sp, #4] @ load back p0 |
| 1526 + ldr r12,[sp, #12] @ load back p1 |
| 1527 + orr lr, lr, r6 |
| 1528 + orr lr, lr, r7 |
| 1529 + |
| 1530 + uqsub8 r6, r11, r9 @ p0 - q0 |
| 1531 + uqsub8 r7, r9, r11 @ q0 - p0 |
| 1532 + uqsub8 r8, r12, r10 @ p1 - q1 |
| 1533 + uqsub8 r11, r10, r12 @ q1 - p1 |
| 1534 + orr r6, r6, r7 @ abs (p0-q0) |
| 1535 + ldr r7, c0x7F7F7F7F |
| 1536 + orr r8, r8, r11 @ abs (p1-q1) |
| 1537 + uqadd8 r6, r6, r6 @ abs (p0-q0) * 2 |
| 1538 + and r8, r7, r8, lsr #1 @ abs (p1-q1) / 2 |
| 1539 + uqsub8 r11, r10, r9 @ q1 - q0 |
| 1540 + uqadd8 r6, r8, r6 @ abs (p0-q0)*2 + abs (p1-q1)/2 |
| 1541 + uqsub8 r12, r9, r10 @ q0 - q1 |
| 1542 + uqsub8 r6, r6, r4 @ compare to flimit |
| 1543 + |
| 1544 + orr r9, r11, r12 @ abs (q1-q0) |
| 1545 + uqsub8 r8, r9, r2 @ compare to limit |
| 1546 + uqsub8 r10, r9, r3 @ compare to thresh |
| 1547 + orr lr, lr, r6 |
| 1548 + orr lr, lr, r8 |
| 1549 + |
| 1550 + mvn r11, #0 @ r11 == -1 |
| 1551 + mov r12, #0 |
| 1552 + |
| 1553 + usub8 lr, r12, lr |
| 1554 + ldr r9, [sp] @ load the compared result |
| 1555 + sel lr, r11, r12 @ filter mask: lr |
| 1556 + |
| 1557 + cmp lr, #0 |
| 1558 + beq 2f @ skip filtering |
| 1559 + |
| 1560 + |
| 1561 + @vp8_hevmask() function |
| 1562 + @calculate high edge variance |
| 1563 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1564 + |
| 1565 + orr r9, r9, r10 |
| 1566 + |
| 1567 + ldrh r7, [r0, #-2] |
| 1568 + ldrh_post r8, r0, r1 |
| 1569 + |
| 1570 + usub8 r9, r12, r9 |
| 1571 + sel r6, r12, r11 @ hev mask: r6 |
| 1572 + |
| 1573 + |
| 1574 + @ vp8_mbfilter() function |
| 1575 + @ p2, q2 are only needed at the end. do not need to load them in now. |
| 1576 + @ Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first |
| 1577 + @ load source data to r6, r11, r12, lr |
| 1578 + ldrh r9, [r0, #-2] |
| 1579 + ldrh_post r10, r0, r1 |
| 1580 + |
| 1581 + pkhbt r12, r7, r8, lsl #16 |
| 1582 + |
| 1583 + ldrh r7, [r0, #-2] |
| 1584 + ldrh_post r8, r0, r1 |
| 1585 + |
| 1586 + pkhbt r11, r9, r10, lsl #16 |
| 1587 + |
| 1588 + ldrh r9, [r0, #-2] |
| 1589 + ldrh_post r10, r0, r1 |
| 1590 + |
| 1591 + str r6, [sp] @ save r6 |
| 1592 + str lr, [sp, #4] @ save lr |
| 1593 + |
| 1594 + pkhbt r6, r7, r8, lsl #16 |
| 1595 + pkhbt lr, r9, r10, lsl #16 |
| 1596 + |
| 1597 + @transpose r12, r11, r6, lr to p1, p0, q0, q1 |
| 1598 + TRANSPOSE_MATRIX r12, r11, r6, lr, r10, r9, r8, r7 |
| 1599 + |
| 1600 + @load back hev_mask r6 and filter_mask lr |
| 1601 + ldr r12, c0x80808080 |
| 1602 + ldr r6, [sp] |
| 1603 + ldr lr, [sp, #4] |
| 1604 + |
| 1605 + eor r7, r7, r12 @ ps1 |
| 1606 + eor r8, r8, r12 @ ps0 |
| 1607 + eor r9, r9, r12 @ qs0 |
| 1608 + eor r10, r10, r12 @ qs1 |
| 1609 + |
| 1610 + qsub8 r12, r9, r8 @ vp8_signed_char_clamp(vp8_fil
ter + 3 * ( qs0 - ps0)) |
| 1611 + str r7, [sp, #12] @ store ps1 temporarily |
| 1612 + qsub8 r7, r7, r10 @ vp8_signed_char_clamp(ps1-qs1
) |
| 1613 + str r10,[sp, #8] @ store qs1 temporarily |
| 1614 + qadd8 r7, r7, r12 |
| 1615 + str r9, [sp] @ store qs0 temporarily |
| 1616 + qadd8 r7, r7, r12 |
| 1617 + str r8, [sp, #4] @ store ps0 temporarily |
| 1618 + qadd8 r7, r7, r12 @ vp8_filter: r7 |
| 1619 + |
| 1620 + ldr r10, c0x03030303 @ r10 = 3 --modified for vp8 |
| 1621 + ldr r9, c0x04040404 |
| 1622 + |
| 1623 + and r7, r7, lr @ vp8_filter &= mask (lr is fre
e) |
| 1624 + |
| 1625 + mov r12, r7 @ Filter2: r12 |
| 1626 + and r12, r12, r6 @ Filter2 &= hev |
| 1627 + |
| 1628 + @save bottom 3 bits so that we round one side +4 and the other +3 |
| 1629 + qadd8 r8, r12, r9 @ Filter1 (r8) = vp8_signed_cha
r_clamp(Filter2+4) |
| 1630 + qadd8 r12, r12, r10 @ Filter2 (r12) = vp8_signed_ch
ar_clamp(Filter2+3) |
| 1631 + |
| 1632 + mov r10, #0 |
| 1633 + shadd8 r8, r8, r10 @ Filter1 >>= 3 |
| 1634 + shadd8 r12, r12, r10 @ Filter2 >>= 3 |
| 1635 + shadd8 r8, r8, r10 |
| 1636 + shadd8 r12, r12, r10 |
| 1637 + shadd8 r8, r8, r10 @ r8: Filter1 |
| 1638 + shadd8 r12, r12, r10 @ r12: Filter2 |
| 1639 + |
| 1640 + ldr r9, [sp] @ load qs0 |
| 1641 + ldr r11,[sp, #4] @ load ps0 |
| 1642 + |
| 1643 + qsub8 r9, r9, r8 @ qs0 = vp8_signed_char_clamp(q
s0 - Filter1) |
| 1644 + qadd8 r11, r11, r12 @ ps0 = vp8_signed_char_clamp(p
s0 + Filter2) |
| 1645 + |
| 1646 + bic r12, r7, r6 @vp8_filter &= ~hev ( r6 is
free) |
| 1647 + |
| 1648 + @roughly 3/7th difference across boundary |
| 1649 + mov lr, #0x1b @ 27 |
| 1650 + mov r7, #0x3f @ 63 |
| 1651 + |
| 1652 + sxtb16 r6, r12 |
| 1653 + sxtb16 r10, r12, ror #8 |
| 1654 + smlabb r8, r6, lr, r7 |
| 1655 + smlatb r6, r6, lr, r7 |
| 1656 + smlabb r7, r10, lr, r7 |
| 1657 + smultb r10, r10, lr |
| 1658 + ssat r8, #8, r8, asr #7 |
| 1659 + ssat r6, #8, r6, asr #7 |
| 1660 + add r10, r10, #63 |
| 1661 + ssat r7, #8, r7, asr #7 |
| 1662 + ssat r10, #8, r10, asr #7 |
| 1663 + |
| 1664 + ldr lr, c0x80808080 |
| 1665 + |
| 1666 + pkhbt r6, r8, r6, lsl #16 |
| 1667 + pkhbt r10, r7, r10, lsl #16 |
| 1668 + uxtb16 r6, r6 |
| 1669 + uxtb16 r10, r10 |
| 1670 + |
| 1671 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1672 + |
| 1673 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 27)>>7) |
| 1674 + |
| 1675 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0
- u) |
| 1676 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0
+ u) |
| 1677 + eor r8, r8, lr @ *oq0 = s^0x80 |
| 1678 + eor r10, r10, lr @ *op0 = s^0x80 |
| 1679 + |
| 1680 + strb r10,[r0, #-1] @ store op0 result |
| 1681 + strb_post r8, r0, r1 @ store oq0 result |
| 1682 + mov r10, r10, lsr #8 |
| 1683 + mov r8, r8, lsr #8 |
| 1684 + strb r10,[r0, #-1] |
| 1685 + strb_post r8, r0, r1 |
| 1686 + mov r10, r10, lsr #8 |
| 1687 + mov r8, r8, lsr #8 |
| 1688 + strb r10,[r0, #-1] |
| 1689 + strb_post r8, r0, r1 |
| 1690 + mov r10, r10, lsr #8 |
| 1691 + mov r8, r8, lsr #8 |
| 1692 + strb r10,[r0, #-1] |
| 1693 + strb_post r8, r0, r1 |
| 1694 + |
| 1695 + @roughly 2/7th difference across boundary |
| 1696 + mov lr, #0x12 @ 18 |
| 1697 + mov r7, #0x3f @ 63 |
| 1698 + |
| 1699 + sxtb16 r6, r12 |
| 1700 + sxtb16 r10, r12, ror #8 |
| 1701 + smlabb r8, r6, lr, r7 |
| 1702 + smlatb r6, r6, lr, r7 |
| 1703 + smlabb r9, r10, lr, r7 |
| 1704 + smlatb r10, r10, lr, r7 |
| 1705 + ssat r8, #8, r8, asr #7 |
| 1706 + ssat r6, #8, r6, asr #7 |
| 1707 + ssat r9, #8, r9, asr #7 |
| 1708 + ssat r10, #8, r10, asr #7 |
| 1709 + |
| 1710 + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4 lin
es |
| 1711 + |
| 1712 + pkhbt r6, r8, r6, lsl #16 |
| 1713 + pkhbt r10, r9, r10, lsl #16 |
| 1714 + |
| 1715 + ldr r9, [sp, #8] @ load qs1 |
| 1716 + ldr r11,[sp, #12] @ load ps1 |
| 1717 + ldr lr, c0x80808080 |
| 1718 + |
| 1719 + uxtb16 r6, r6 |
| 1720 + uxtb16 r10, r10 |
| 1721 + |
| 1722 + add r0, r0, #2 |
| 1723 + |
| 1724 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 18)>>7) |
| 1725 + |
| 1726 + qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs1
- u) |
| 1727 + qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps1
+ u) |
| 1728 + eor r8, r8, lr @ *oq1 = s^0x80 |
| 1729 + eor r10, r10, lr @ *op1 = s^0x80 |
| 1730 + |
| 1731 + ldrb r11,[r0, #-5] @ load p2 for 1/7th difference
across boundary |
| 1732 + strb r10,[r0, #-4] @ store op1 |
| 1733 + strb r8, [r0, #-1] @ store oq1 |
| 1734 + ldrb_post r9, r0, r1 @ load q2 for 1/7th difference
across boundary |
| 1735 + |
| 1736 + mov r10, r10, lsr #8 |
| 1737 + mov r8, r8, lsr #8 |
| 1738 + |
| 1739 + ldrb r6, [r0, #-5] |
| 1740 + strb r10,[r0, #-4] |
| 1741 + strb r8, [r0, #-1] |
| 1742 + ldrb_post r7, r0, r1 |
| 1743 + |
| 1744 + mov r10, r10, lsr #8 |
| 1745 + mov r8, r8, lsr #8 |
| 1746 + orr r11, r11, r6, lsl #8 |
| 1747 + orr r9, r9, r7, lsl #8 |
| 1748 + |
| 1749 + ldrb r6, [r0, #-5] |
| 1750 + strb r10,[r0, #-4] |
| 1751 + strb r8, [r0, #-1] |
| 1752 + ldrb_post r7, r0, r1 |
| 1753 + |
| 1754 + mov r10, r10, lsr #8 |
| 1755 + mov r8, r8, lsr #8 |
| 1756 + orr r11, r11, r6, lsl #16 |
| 1757 + orr r9, r9, r7, lsl #16 |
| 1758 + |
| 1759 + ldrb r6, [r0, #-5] |
| 1760 + strb r10,[r0, #-4] |
| 1761 + strb r8, [r0, #-1] |
| 1762 + ldrb_post r7, r0, r1 |
| 1763 + orr r11, r11, r6, lsl #24 |
| 1764 + orr r9, r9, r7, lsl #24 |
| 1765 + |
| 1766 + @roughly 1/7th difference across boundary |
| 1767 + eor r9, r9, lr |
| 1768 + eor r11, r11, lr |
| 1769 + |
| 1770 + mov lr, #0x9 @ 9 |
| 1771 + mov r7, #0x3f @ 63 |
| 1772 + |
| 1773 + sxtb16 r6, r12 |
| 1774 + sxtb16 r10, r12, ror #8 |
| 1775 + smlabb r8, r6, lr, r7 |
| 1776 + smlatb r6, r6, lr, r7 |
| 1777 + smlabb r12, r10, lr, r7 |
| 1778 + smlatb r10, r10, lr, r7 |
| 1779 + ssat r8, #8, r8, asr #7 |
| 1780 + ssat r6, #8, r6, asr #7 |
| 1781 + ssat r12, #8, r12, asr #7 |
| 1782 + ssat r10, #8, r10, asr #7 |
| 1783 + |
| 1784 + sub r0, r0, r1, lsl #2 |
| 1785 + |
| 1786 + pkhbt r6, r8, r6, lsl #16 |
| 1787 + pkhbt r10, r12, r10, lsl #16 |
| 1788 + |
| 1789 + uxtb16 r6, r6 |
| 1790 + uxtb16 r10, r10 |
| 1791 + |
| 1792 + ldr lr, c0x80808080 |
| 1793 + |
| 1794 + orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63
+ Filter2 * 9)>>7) |
| 1795 + |
| 1796 + qadd8 r8, r11, r10 @ s = vp8_signed_char_clamp(ps2
+ u) |
| 1797 + qsub8 r10, r9, r10 @ s = vp8_signed_char_clamp(qs2
- u) |
| 1798 + eor r8, r8, lr @ *op2 = s^0x80 |
| 1799 + eor r10, r10, lr @ *oq2 = s^0x80 |
| 1800 + |
| 1801 + strb r8, [r0, #-5] @ store *op2 |
| 1802 + strb_post r10, r0, r1 @ store *oq2 |
| 1803 + mov r8, r8, lsr #8 |
| 1804 + mov r10, r10, lsr #8 |
| 1805 + strb r8, [r0, #-5] |
| 1806 + strb_post r10, r0, r1 |
| 1807 + mov r8, r8, lsr #8 |
| 1808 + mov r10, r10, lsr #8 |
| 1809 + strb r8, [r0, #-5] |
| 1810 + strb_post r10, r0, r1 |
| 1811 + mov r8, r8, lsr #8 |
| 1812 + mov r10, r10, lsr #8 |
| 1813 + strb r8, [r0, #-5] |
| 1814 + strb_post r10, r0, r1 |
| 1815 + |
| 1816 + @adjust r0 pointer for next loop |
| 1817 + sub r0, r0, #2 |
| 1818 + |
| 1819 +2: |
| 1820 + sub r0, r0, #4 |
| 1821 + subs r5, r5, #1 |
| 1822 + |
| 1823 +T ittt ne |
| 1824 + ldrne r7, [r0, r1] |
| 1825 +A ldrne r6, [r0], r1, lsl #1 @ load source data |
| 1826 +T ldrne r6, [r0] |
| 1827 +T addne r0, r0, r1, lsl #1 |
| 1828 +T ittt ne |
| 1829 + ldrne lr, [r0, r1] |
| 1830 +A ldrne r8, [r0], r1, lsl #1 |
| 1831 +T ldrne r8, [r0] |
| 1832 +T addne r0, r0, r1, lsl #1 |
| 1833 + |
| 1834 + bne 1b |
| 1835 + |
| 1836 + add sp, sp, #16 |
| 1837 + pop {r4 - r11, pc} |
| 1838 +endfunc |
| 1839 + |
| 1840 +@ MC |
| 1841 + |
| 1842 +@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src, |
| 1843 +@ int srcstride, int h, int mx, int my) |
| 1844 +function ff_put_vp8_pixels16_armv6, export=1 |
| 1845 + push {r4 - r11} |
| 1846 + ldr r12,[sp, #32] @ h |
| 1847 +1: |
| 1848 + subs r12, r12, #2 |
| 1849 + ldr r5, [r2, #4] |
| 1850 + ldr r6, [r2, #8] |
| 1851 + ldr r7, [r2, #12] |
| 1852 + ldr_post r4, r2, r3 |
| 1853 + ldr r9, [r2, #4] |
| 1854 + ldr r10,[r2, #8] |
| 1855 + ldr r11,[r2, #12] |
| 1856 + ldr_post r8, r2, r3 |
| 1857 + strd r6, r7, [r0, #8] |
| 1858 + strd_post r4, r5, r0, r1 |
| 1859 + strd r10, r11,[r0, #8] |
| 1860 + strd_post r8, r9, r0, r1 |
| 1861 + bgt 1b |
| 1862 + pop {r4 - r11} |
| 1863 + bx lr |
| 1864 +endfunc |
| 1865 + |
| 1866 +@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src, |
| 1867 +@ int srcstride, int h, int mx, int my) |
| 1868 +function ff_put_vp8_pixels8_armv6, export=1 |
| 1869 + push {r4 - r11} |
| 1870 + ldr r12,[sp, #32] @ h |
| 1871 +1: |
| 1872 + subs r12, r12, #4 |
| 1873 + ldr r5, [r2, #4] |
| 1874 + ldr_post r4, r2, r3 |
| 1875 + ldr r7, [r2, #4] |
| 1876 + ldr_post r6, r2, r3 |
| 1877 + ldr r9, [r2, #4] |
| 1878 + ldr_post r8, r2, r3 |
| 1879 + ldr r11,[r2, #4] |
| 1880 + ldr_post r10, r2, r3 |
| 1881 + strd_post r4, r5, r0, r1 |
| 1882 + strd_post r6, r7, r0, r1 |
| 1883 + strd_post r8, r9, r0, r1 |
| 1884 + strd_post r10, r11, r0, r1 |
| 1885 + bgt 1b |
| 1886 + pop {r4 - r11} |
| 1887 + bx lr |
| 1888 +endfunc |
| 1889 + |
| 1890 +@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src, |
| 1891 +@ int srcstride, int h, int mx, int my) |
| 1892 +function ff_put_vp8_pixels4_armv6, export=1 |
| 1893 + ldr r12, [sp, #0] @ h |
| 1894 + push {r4 - r6, lr} |
| 1895 +1: |
| 1896 + subs r12, r12, #4 |
| 1897 + ldr r5, [r2, r3] |
| 1898 + ldr_post r4, r2, r3, lsl #1 |
| 1899 + ldr lr, [r2, r3] |
| 1900 + ldr_post r6, r2, r3, lsl #1 |
| 1901 + str r5, [r0, r1] |
| 1902 + str_post r4, r0, r1, lsl #1 |
| 1903 + str lr, [r0, r1] |
| 1904 + str_post r6, r0, r1, lsl #1 |
| 1905 + bgt 1b |
| 1906 + pop {r4 - r6, pc} |
| 1907 +endfunc |
| 1908 + |
| 1909 +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit |
| 1910 +@ arithmetic can be used to apply filters |
| 1911 +const sixtap_filters_13245600, align=4 |
| 1912 + .short 2, 108, -11, 36, -8, 1, 0, 0 |
| 1913 + .short 3, 77, -16, 77, -16, 3, 0, 0 |
| 1914 + .short 1, 36, -8, 108, -11, 2, 0, 0 |
| 1915 +endconst |
| 1916 +const fourtap_filters_1324, align=4 |
| 1917 + .short -6, 12, 123, -1 |
| 1918 + .short -9, 50, 93, -6 |
| 1919 + .short -6, 93, 50, -9 |
| 1920 + .short -1, 123, 12, -6 |
| 1921 +endconst |
| 1922 + |
| 1923 +@ void put_vp8_epel_h6(uint8_t *dst, int dststride, uint8_t *src, |
| 1924 +@ int srcstride, int w, int h, int mx) |
| 1925 +function ff_put_vp8_epel_h6_armv6, export=1 |
| 1926 + push {r4 - r11, lr} |
| 1927 + |
| 1928 + sub r2, r2, #2 |
| 1929 + movrel lr, sixtap_filters_13245600 - 16 |
| 1930 + ldr r12,[sp, #44] @ vp8_filter index |
| 1931 + ldr r4, [sp, #36] @ width |
| 1932 + add lr, lr, r12, lsl #3 |
| 1933 + sub r3, r3, r4 @ src_stride - block_width |
| 1934 + sub r1, r1, r4 @ dst_stride - block_width |
| 1935 + lsr r4, #2 |
| 1936 + |
| 1937 + str r4, [sp, #36] @ "4-in-parallel" loop counter
@40 |
| 1938 + str r3, [sp, #44] @ src_stride - block_width @48 |
| 1939 + push {r1} @ dst_stride - block_width @0 |
| 1940 + @ height @44 |
| 1941 + |
| 1942 + ldr r1, [lr], #4 @ coefficients |
| 1943 + ldr r3, [lr], #4 |
| 1944 + ldr lr, [lr] |
| 1945 +1: |
| 1946 + @ 3 loads, 10 shuffles and then mul/acc/add/shr |
| 1947 + @ o0: i0/i1/i2/i3/i4/i5 -> i0/i2 (ld1) | i1/i3 (ld1) | i4/i5 (ld2) |
| 1948 + @ o1: i1/i2/i3/i4/i5/i6 -> i1/i3 (ld1) | i2/i4 (ld2) | i5/i6 (ld2/3) |
| 1949 + @ o2: i2/i3/i4/i5/i6/i7 -> i2/i4 (ld2) | i3/i5 (ld2) | i6/i7 (ld3) |
| 1950 + @ o3: i3/i4/i5/i6/i7/i8 -> i3/i5 (ld2) | i4/i6 (ld2/3) | i7/i8 (ld3) |
| 1951 + ldr r7, [r2, #5] @ ld3 -> src[5-8] |
| 1952 + ldr r6, [r2, #2] @ ld2 -> src[2-5] |
| 1953 + ldr r5, [r2], #4 @ ld1 -> src[0-3] |
| 1954 + |
| 1955 + pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6] |
| 1956 + uxtb16 r9, r6, ror #8 @ src[5] | src[3] |
| 1957 + uxtb16 r6, r6 @ src[4] | src[2] |
| 1958 + uxtb16 r8, r5, ror #8 @ src[3] | src[1] |
| 1959 + uxtb16 r11, r7, ror #8 @ src[8] | src[7] |
| 1960 + uxtb16 r7, r7 @ src[7] | src[6] |
| 1961 + pkhtb r10, r9, r6, asr #16 @ src[5] | src[4] |
| 1962 + uxtb16 r5, r5 @ src[2] | src[0] |
| 1963 + |
| 1964 + smuad r11, r11, lr @ filter[3][2] -> r11 |
| 1965 + subs r4, r4, #1 |
| 1966 + pkhbt r12, r10, r7, lsl #16 @ src[6] | src[4] |
| 1967 + smuad r7, r7, lr @ filter[2][2] -> r7 |
| 1968 + smuad r5, r5, r1 @ filter[0][0] -> r5 |
| 1969 + smlad r11, r9, r1, r11 @ filter[3][0] -> r11 |
| 1970 + smlad r7, r9, r3, r7 @ filter[2][1] -> r7 |
| 1971 + smuad r9, r8, r1 @ filter[1][0] -> r9 |
| 1972 + smlad r5, r8, r3, r5 @ filter[0][1] -> r5 |
| 1973 + pkhtb r8, r12, r10, asr #16 @ src[6] | src[5] |
| 1974 + smlad r11, r12, r3, r11 @ filter[3][1] -> r11 |
| 1975 + smlad r9, r6, r3, r9 @ filter[1][1] -> r9 |
| 1976 + smlad r5, r10, lr, r5 @ filter[0][2] -> r5 |
| 1977 + smlad r7, r6, r1, r7 @ filter[2][0] -> r7 |
| 1978 + smlad r9, r8, lr, r9 @ filter[1][2] -> r9 |
| 1979 + |
| 1980 + add r5, r5, #0x40 @ round_shift_and_clamp[0] |
| 1981 + add r9, r9, #0x40 @ round_shift_and_clamp[1] |
| 1982 + add r7, r7, #0x40 @ round_shift_and_clamp[2] |
| 1983 + add r11, r11, #0x40 @ round_shift_and_clamp[3] |
| 1984 + |
| 1985 + usat r5, #8, r5, asr #7 |
| 1986 + usat r9, #8, r9, asr #7 |
| 1987 + usat r7, #8, r7, asr #7 |
| 1988 + usat r11, #8, r11, asr #7 |
| 1989 + |
| 1990 + strb r5, [r0], #1 @ store res[0] |
| 1991 + strb r9, [r0], #1 @ store res[1] |
| 1992 + strb r7, [r0], #1 @ store res[2] |
| 1993 + strb r11,[r0], #1 @ store res[3] |
| 1994 + |
| 1995 + bne 1b |
| 1996 + |
| 1997 + ldr r12,[sp, #44] @ height = outer-loop counter |
| 1998 + subs r12, r12, #1 |
| 1999 +T itttt ne |
| 2000 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter |
| 2001 + ldrne r5, [sp, #48] |
| 2002 + ldrne r6, [sp] |
| 2003 + strne r12,[sp, #44] |
| 2004 + add r2, r2, r5 @ move to next input/output lin
es |
| 2005 + add r0, r0, r6 |
| 2006 + |
| 2007 + bne 1b |
| 2008 + |
| 2009 + add sp, sp, #4 @ restore stack after push{r1}
above |
| 2010 + pop {r4 - r11, pc} |
| 2011 +endfunc |
| 2012 + |
| 2013 +@ void put_vp8_epel_v6(uint8_t *dst, int dststride, uint8_t *src, |
| 2014 +@ int srcstride, int w, int h, int my) |
| 2015 +function ff_put_vp8_epel_v6_armv6, export=1 |
| 2016 + push {r4 - r11, lr} |
| 2017 + |
| 2018 + movrel lr, sixtap_filters_13245600 - 16 |
| 2019 + ldr r12,[sp, #44] @ vp8_filter index |
| 2020 + ldr r4, [sp, #36] @ width |
| 2021 + add lr, lr, r12, lsl #3 |
| 2022 + sub r1, r1, r4 @ dst_stride - block_width |
| 2023 + lsr r4, #2 |
| 2024 + |
| 2025 + str r4, [sp, #36] @ "4-in-parallel" loop counter
@40 |
| 2026 + str r3, [sp, #44] @ src_stride - block_width @48 |
| 2027 + push {r1} @ dst_stride - block_width @0 |
| 2028 + @ height @44 |
| 2029 +1: |
| 2030 + add r1, r3, r3, lsl #1 @ stride * 3 |
| 2031 + ldr_dpren r5, r2, r3 @ src[0,1,2,3 + stride * 1] |
| 2032 + ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3] |
| 2033 + ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4] |
| 2034 + ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5] |
| 2035 + |
| 2036 + @ byte -> word and "transpose" |
| 2037 + uxtb16 r9, r5, ror #8 @ src[3 + stride*1] | src[1 + s
tride*1] |
| 2038 + uxtb16 r10, r6, ror #8 @ src[3 + stride*3] | src[1 + s
tride*3] |
| 2039 + uxtb16 r11, r7, ror #8 @ src[3 + stride*4] | src[1 + s
tride*4] |
| 2040 + uxtb16 r12, r8, ror #8 @ src[3 + stride*5] | src[1 + s
tride*5] |
| 2041 + uxtb16 r5, r5 @ src[2 + stride*1] | src[0 + s
tride*1] |
| 2042 + uxtb16 r6, r6 @ src[2 + stride*3] | src[0 + s
tride*3] |
| 2043 + uxtb16 r7, r7 @ src[2 + stride*4] | src[0 + s
tride*4] |
| 2044 + uxtb16 r8, r8 @ src[2 + stride*5] | src[0 + s
tride*5] |
| 2045 + pkhbt r1, r9, r10, lsl #16 @ src[1 + stride*3] | src[1 + s
tride*1] |
| 2046 + pkhtb r9, r10, r9, asr #16 @ src[3 + stride*3] | src[3 + s
tride*1] |
| 2047 + pkhbt r10, r11, r12, lsl #16 @ src[1 + stride*5] | src[1 + s
tride*4] |
| 2048 + pkhtb r11, r12, r11, asr #16 @ src[3 + stride*5] | src[3 + s
tride*4] |
| 2049 + pkhbt r12, r5, r6, lsl #16 @ src[0 + stride*3] | src[0 + s
tride*1] |
| 2050 + pkhtb r5, r6, r5, asr #16 @ src[2 + stride*3] | src[2 + s
tride*1] |
| 2051 + pkhbt r6, r7, r8, lsl #16 @ src[0 + stride*5] | src[0 + s
tride*4] |
| 2052 + pkhtb r7, r8, r7, asr #16 @ src[2 + stride*5] | src[2 + s
tride*4] |
| 2053 + |
| 2054 + ldr r8, [lr, #4] @ stall - if only I had more re
gisters... |
| 2055 + smuad r12, r12, r8 @ filter[0][1] |
| 2056 + smuad r1, r1, r8 @ filter[1][1] |
| 2057 + smuad r5, r5, r8 @ filter[2][1] |
| 2058 + smuad r9, r9, r8 @ filter[3][1] |
| 2059 + ldr r8, [lr, #8] @ stall - if only I had more re
gisters... |
| 2060 + smlad r12, r6, r8, r12 @ filter[0][2] |
| 2061 + smlad r1, r10, r8, r1 @ filter[1][2] |
| 2062 + ldr_dpren r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0] |
| 2063 + ldr r10,[r2], #4 @ src[0,1,2,3 + stride * 2] |
| 2064 + smlad r5, r7, r8, r5 @ filter[2][2] |
| 2065 + smlad r9, r11, r8, r9 @ filter[3][2] |
| 2066 + |
| 2067 + uxtb16 r7, r6, ror #8 @ src[3 + stride*0] | src[1 + s
tride*0] |
| 2068 + uxtb16 r11, r10, ror #8 @ src[3 + stride*2] | src[1 + s
tride*2] |
| 2069 + uxtb16 r6, r6 @ src[2 + stride*0] | src[0 + s
tride*0] |
| 2070 + uxtb16 r10, r10 @ src[2 + stride*2] | src[0 + s
tride*2] |
| 2071 + |
| 2072 + pkhbt r8, r7, r11, lsl #16 @ src[1 + stride*2] | src[1 + s
tride*0] |
| 2073 + pkhtb r7, r11, r7, asr #16 @ src[3 + stride*2] | src[3 + s
tride*0] |
| 2074 + pkhbt r11, r6, r10, lsl #16 @ src[0 + stride*2] | src[0 + s
tride*0] |
| 2075 + pkhtb r6, r10, r6, asr #16 @ src[2 + stride*2] | src[2 + s
tride*0] |
| 2076 + |
| 2077 + ldr r10,[lr] @ stall - if only I had more re
gisters... |
| 2078 + subs r4, r4, #1 @ counter-- |
| 2079 + smlad r12, r11, r10, r12 @ filter[0][0] |
| 2080 + smlad r1, r8, r10, r1 @ filter[1][0] |
| 2081 + smlad r5, r6, r10, r5 @ filter[2][0] |
| 2082 + smlad r9, r7, r10, r9 @ filter[3][0] |
| 2083 + |
| 2084 + add r12, r12, #0x40 @ round_shift_and_clamp[0] |
| 2085 + add r1, r1, #0x40 @ round_shift_and_clamp[1] |
| 2086 + add r5, r5, #0x40 @ round_shift_and_clamp[2] |
| 2087 + add r9, r9, #0x40 @ round_shift_and_clamp[3] |
| 2088 + |
| 2089 + usat r12, #8, r12, asr #7 |
| 2090 + usat r1, #8, r1, asr #7 |
| 2091 + usat r5, #8, r5, asr #7 |
| 2092 + usat r9, #8, r9, asr #7 |
| 2093 + |
| 2094 + strb r12,[r0], #1 @ store res[0] |
| 2095 + strb r1, [r0], #1 @ store res[1] |
| 2096 + strb r5, [r0], #1 @ store res[2] |
| 2097 + strb r9, [r0], #1 @ store res[3] |
| 2098 + |
| 2099 + bne 1b |
| 2100 + |
| 2101 + ldr r12,[sp, #44] @ height = outer-loop counter |
| 2102 + subs r12, r12, #1 |
| 2103 +T itttt ne |
| 2104 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter |
| 2105 + ldrne r6, [sp, #0] |
| 2106 + subne r2, r2, r4, lsl #2 |
| 2107 + strne r12,[sp, #44] |
| 2108 + add r0, r0, r6 |
| 2109 + add r2, r2, r3 @ move to next input/output lin
es |
| 2110 + |
| 2111 + bne 1b |
| 2112 + |
| 2113 + add sp, sp, #4 @ restore stack after push{r1}
above |
| 2114 + pop {r4 - r11, pc} |
| 2115 +endfunc |
| 2116 + |
| 2117 +@ void put_vp8_epel_h4(uint8_t *dst, int dststride, uint8_t *src, |
| 2118 +@ int srcstride, int w, int h, int mx) |
| 2119 +function ff_put_vp8_epel_h4_armv6, export=1 |
| 2120 + push {r4 - r11, lr} |
| 2121 + |
| 2122 + subs r2, r2, #1 |
| 2123 + movrel lr, fourtap_filters_1324 - 4 |
| 2124 + ldr r4, [sp, #36] @ width |
| 2125 + ldr r12,[sp, #44] @ vp8_filter index |
| 2126 + add lr, lr, r12, lsl #2 |
| 2127 + sub r3, r3, r4 @ src_stride - block_width |
| 2128 + sub r1, r1, r4 @ dst_stride - block_width |
| 2129 + ldr r5, [lr] |
| 2130 + ldr r6, [lr, #4] |
| 2131 + asr r4, #2 |
| 2132 + |
| 2133 + ldr lr, [sp, #40] @ height = outer-loop counter |
| 2134 + str r4, [sp, #36] @ "4-in-parallel" inner loop co
unter |
| 2135 +1: |
| 2136 + @ 3 loads, 5 uxtb16s and then mul/acc/add/shr |
| 2137 + @ o0: i0/i1/i2/i3 -> i0/i2(ld1) + i1/i3(ld1) |
| 2138 + @ o1: i1/i2/i3/i4 -> i1/i3(ld1) + i2/i4(ld2) |
| 2139 + @ o2: i2/i3/i4/i5 -> i2/i4(ld2) + i3/i5(ld2) |
| 2140 + @ o3: i3/i4/i5/i6 -> i3/i5(ld2) + i4/i6(ld3) |
| 2141 + ldr r9, [r2, #3] @ load source data |
| 2142 + ldr r8, [r2, #2] |
| 2143 + ldr r7, [r2], #4 |
| 2144 + |
| 2145 + uxtb16 r9, r9, ror #8 @ src[6] | src[4] |
| 2146 + uxtb16 r10, r8, ror #8 @ src[5] | src[3] |
| 2147 + uxtb16 r8, r8 @ src[4] | src[2] |
| 2148 + uxtb16 r11, r7, ror #8 @ src[3] | src[1] |
| 2149 + uxtb16 r7, r7 @ src[2] | src[0] |
| 2150 + |
| 2151 + smuad r9, r9, r6 @ filter[3][1] -> r9 |
| 2152 + smuad r12, r10, r6 @ filter[2][1] -> r12 |
| 2153 + smuad r7, r7, r5 @ filter[0][0] -> r7 |
| 2154 + smlad r9, r10, r5, r9 @ filter[3][0] -> r9 |
| 2155 + smuad r10, r11, r5 @ filter[1][0] -> r10 |
| 2156 + smlad r12, r8, r5, r12 @ filter[2][0] -> r12 |
| 2157 + smlad r7, r11, r6, r7 @ filter[0][1] -> r7 |
| 2158 + smlad r10, r8, r6, r10 @ filter[1][1] -> r10 |
| 2159 + |
| 2160 + subs r4, r4, #1 @ counter-- |
| 2161 + |
| 2162 + add r7, r7, #0x40 @ round_shift_and_clamp[0] |
| 2163 + add r10, r10, #0x40 @ round_shift_and_clamp[1] |
| 2164 + add r12, r12, #0x40 @ round_shift_and_clamp[2] |
| 2165 + add r9, r9, #0x40 @ round_shift_and_clamp[3] |
| 2166 + |
| 2167 + usat r7, #8, r7, asr #7 |
| 2168 + usat r10, #8, r10, asr #7 |
| 2169 + usat r12, #8, r12, asr #7 |
| 2170 + usat r9, #8, r9, asr #7 |
| 2171 + |
| 2172 + strb r7, [r0], #1 @ store res[0] |
| 2173 + strb r10,[r0], #1 @ store res[1] |
| 2174 + strb r12,[r0], #1 @ store res[2] |
| 2175 + strb r9, [r0], #1 @ store res[3] |
| 2176 + |
| 2177 + bne 1b |
| 2178 + |
| 2179 + subs lr, lr, #1 |
| 2180 +T it ne |
| 2181 + ldrne r4, [sp, #36] @ 4-in-parallel loop counter |
| 2182 + add r2, r2, r3 @ move to next input/output lin
es |
| 2183 + add r0, r0, r1 |
| 2184 + |
| 2185 + bne 1b |
| 2186 + |
| 2187 + pop {r4 - r11, pc} |
| 2188 +endfunc |
| 2189 + |
| 2190 +@ void put_vp8_epel_v4(uint8_t *dst, int dststride, uint8_t *src, |
| 2191 +@ int srcstride, int w, int h, int my) |
| 2192 +function ff_put_vp8_epel_v4_armv6, export=1 |
| 2193 + push {r4 - r11, lr} |
| 2194 + |
| 2195 + movrel lr, fourtap_filters_1324 - 4 |
| 2196 + ldr r12,[sp, #44] @ vp8_filter index |
| 2197 + ldr r4, [sp, #36] @ width |
| 2198 + add lr, lr, r12, lsl #2 |
| 2199 + sub r1, r1, r4 @ dst_stride - block_width |
| 2200 + asr r4, #2 |
| 2201 + ldr r5, [lr] |
| 2202 + ldr r6, [lr, #4] |
| 2203 + |
| 2204 + str r4, [sp, #36] @ "4-in-parallel" loop counter
@40 |
| 2205 + str r3, [sp, #44] @ src_stride @48 |
| 2206 + push {r1} @ dst_stride - block_width @36 |
| 2207 + @ height @44 |
| 2208 +1: |
| 2209 + ldr lr, [r2, r3, lsl #1] @ load source pixels |
| 2210 + ldr r12,[r2, r3] |
| 2211 + ldr_dpren r7, r2, r3 |
| 2212 + ldr r11,[r2], #4 |
| 2213 + |
| 2214 + @ byte -> word and "transpose" |
| 2215 + uxtb16 r8, lr, ror #8 @ src[3 + stride*3] | src[1 + s
tride*3] |
| 2216 + uxtb16 r9, r12, ror #8 @ src[3 + stride*2] | src[1 + s
tride*2] |
| 2217 + uxtb16 r3, r7, ror #8 @ src[3 + stride*0] | src[1 + s
tride*0] |
| 2218 + uxtb16 r1, r11, ror #8 @ src[3 + stride*1] | src[1 + s
tride*1] |
| 2219 + uxtb16 lr, lr @ src[2 + stride*3] | src[0 + s
tride*3] |
| 2220 + uxtb16 r12, r12 @ src[2 + stride*2] | src[0 + s
tride*2] |
| 2221 + uxtb16 r7, r7 @ src[2 + stride*0] | src[0 + s
tride*0] |
| 2222 + uxtb16 r11, r11 @ src[2 + stride*1] | src[0 + s
tride*1] |
| 2223 + pkhbt r10, r1, r8, lsl #16 @ src[1 + stride*3] | src[1 + s
tride*1] |
| 2224 + pkhtb r1, r8, r1, asr #16 @ src[3 + stride*3] | src[3 + s
tride*1] |
| 2225 + pkhbt r8, r3, r9, lsl #16 @ src[1 + stride*2] | src[1 + s
tride*0] |
| 2226 + pkhtb r3, r9, r3, asr #16 @ src[3 + stride*2] | src[3 + s
tride*0] |
| 2227 + pkhbt r9, r11, lr, lsl #16 @ src[0 + stride*3] | src[0 + s
tride*1] |
| 2228 + pkhtb r11, lr, r11, asr #16 @ src[2 + stride*3] | src[2 + s
tride*1] |
| 2229 + pkhbt lr, r7, r12, lsl #16 @ src[0 + stride*2] | src[0 + s
tride*0] |
| 2230 + pkhtb r7, r12, r7, asr #16 @ src[2 + stride*2] | src[2 + s
tride*0] |
| 2231 + |
| 2232 + smuad r9, r9, r6 @ filter[0][1] |
| 2233 + smuad r10, r10, r6 @ filter[1][1] |
| 2234 + smuad r11, r11, r6 @ filter[2][1] |
| 2235 + smuad r1, r1, r6 @ filter[3][1] |
| 2236 + smlad r9, lr, r5, r9 @ filter[0][0] |
| 2237 + smlad r10, r8, r5, r10 @ filter[1][0] |
| 2238 + smlad r11, r7, r5, r11 @ filter[2][0] |
| 2239 + smlad r1, r3, r5, r1 @ filter[3][0] |
| 2240 + |
| 2241 + subs r4, r4, #1 @ counter-- |
| 2242 + ldr r3, [sp, #48] @ FIXME prevent clobber of r3 a
bove? |
| 2243 + |
| 2244 + add r9, r9, #0x40 @ round_shift_and_clamp[0] |
| 2245 + add r10, r10, #0x40 @ round_shift_and_clamp[1] |
| 2246 + add r11, r11, #0x40 @ round_shift_and_clamp[2] |
| 2247 + add r1, r1, #0x40 @ round_shift_and_clamp[3] |
| 2248 + |
| 2249 + usat r9, #8, r9, asr #7 |
| 2250 + usat r10, #8, r10, asr #7 |
| 2251 + usat r11, #8, r11, asr #7 |
| 2252 + usat r1, #8, r1, asr #7 |
| 2253 + |
| 2254 + strb r9, [r0], #1 @ store result |
| 2255 + strb r10,[r0], #1 |
| 2256 + strb r11,[r0], #1 |
| 2257 + strb r1, [r0], #1 |
| 2258 + |
| 2259 + bne 1b |
| 2260 + |
| 2261 + ldr r12,[sp, #44] @ height = outer-loop counter |
| 2262 + subs r12, r12, #1 |
| 2263 +T ittt ne |
| 2264 + ldrne r4, [sp, #40] @ 4-in-parallel loop counter |
| 2265 + ldrne r9, [sp, #0] |
| 2266 + strne r12,[sp, #44] |
| 2267 + sub r2, r2, r4, lsl #2 |
| 2268 + add r0, r0, r9 |
| 2269 + add r2, r2, r3 @ move to next input/output lin
es |
| 2270 + |
| 2271 + bne 1b |
| 2272 + |
| 2273 + add sp, sp, #4 @ restore stack after push{r1}
above |
| 2274 + pop {r4 - r11, pc} |
| 2275 +endfunc |
| 2276 + |
| 2277 +@ void put_vp8_bilin_h(uint8_t *dst, int dststride, uint8_t *src, |
| 2278 +@ int srcstride, int w, int h, int mx) |
| 2279 +function ff_put_vp8_bilin_h_armv6, export=1 |
| 2280 + push {r4 - r9, lr} |
| 2281 + |
| 2282 + ldr r8, [sp, #36] @ vp8_filter index |
| 2283 + ldr r12,[sp, #32] @ height = outer-loop counter |
| 2284 + ldr r4, [sp, #28] @ width |
| 2285 + lsl r5, r8, #16 @ mx << 16 |
| 2286 + sub r3, r3, r4 @ src_stride - block_width |
| 2287 + sub r1, r1, r4 @ dst_stride - block_width |
| 2288 + asr r4, #2 |
| 2289 + sub r5, r5, r8 @ (mx << 16) | (-mx) |
| 2290 + str r4, [sp, #28] @ "4-in-parallel" loop counter |
| 2291 + add r5, r5, #8 @ (8 - mx) | (mx << 16) = filte
r coefficients |
| 2292 +1: |
| 2293 + ldrb r6, [r2], #1 @ load source data |
| 2294 + ldrb r7, [r2], #1 |
| 2295 + ldrb r8, [r2], #1 |
| 2296 + ldrb r9, [r2], #1 |
| 2297 + ldrb lr, [r2] |
| 2298 + |
| 2299 + pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0] |
| 2300 + pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1] |
| 2301 + pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2] |
| 2302 + pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3] |
| 2303 + |
| 2304 + smuad r6, r6, r5 @ apply the filter |
| 2305 + smuad r7, r7, r5 |
| 2306 + smuad r8, r8, r5 |
| 2307 + smuad r9, r9, r5 |
| 2308 + |
| 2309 + subs r4, r4, #1 @ counter-- |
| 2310 + |
| 2311 + add r6, r6, #0x4 @ round_shift_and_clamp |
| 2312 + add r7, r7, #0x4 |
| 2313 + add r8, r8, #0x4 |
| 2314 + add r9, r9, #0x4 |
| 2315 + |
| 2316 + asr r6, #3 |
| 2317 + asr r7, #3 |
| 2318 + pkhbt r6, r6, r8, lsl #13 |
| 2319 + pkhbt r7, r7, r9, lsl #13 |
| 2320 + orr r6, r6, r7, lsl #8 |
| 2321 + str r6, [r0], #4 @ store result |
| 2322 + |
| 2323 + bne 1b |
| 2324 + |
| 2325 + ldr r4, [sp, #28] @ 4-in-parallel loop counter |
| 2326 + subs r12, r12, #1 |
| 2327 + |
| 2328 + add r2, r2, r3 @ move to next input/output lin
es |
| 2329 + add r0, r0, r1 |
| 2330 + |
| 2331 + bne 1b |
| 2332 + |
| 2333 + pop {r4 - r9, pc} |
| 2334 +endfunc |
| 2335 + |
| 2336 +@ void put_vp8_bilin_v(uint8_t *dst, int dststride, uint8_t *src, |
| 2337 +@ int srcstride, int w, int h, int my) |
| 2338 +function ff_put_vp8_bilin_v_armv6, export=1 |
| 2339 + push {r4 - r11, lr} |
| 2340 + |
| 2341 + ldr r11,[sp, #44] @ vp8_filter index |
| 2342 + ldr r4, [sp, #36] @ width |
| 2343 + mov r5, r11, lsl #16 @ mx << 16 |
| 2344 + ldr r12,[sp, #40] @ height = outer-loop counter |
| 2345 + sub r1, r1, r4 |
| 2346 + sub r5, r5, r11 @ (mx << 16) | (-mx) |
| 2347 + asr r4, #2 |
| 2348 + add r5, r5, #8 @ (8 - mx) | (mx << 16) = filte
r coefficients |
| 2349 + str r4, [sp, #36] @ "4-in-parallel" loop counter |
| 2350 +1: |
| 2351 + ldrb r10,[r2, r3] @ load the data |
| 2352 + ldrb r6, [r2], #1 |
| 2353 + ldrb r11,[r2, r3] |
| 2354 + ldrb r7, [r2], #1 |
| 2355 + ldrb lr, [r2, r3] |
| 2356 + ldrb r8, [r2], #1 |
| 2357 + ldrb r9, [r2, r3] |
| 2358 + pkhbt r6, r6, r10, lsl #16 |
| 2359 + ldrb r10,[r2], #1 |
| 2360 + pkhbt r7, r7, r11, lsl #16 |
| 2361 + pkhbt r8, r8, lr, lsl #16 |
| 2362 + pkhbt r9, r10, r9, lsl #16 |
| 2363 + |
| 2364 + smuad r6, r6, r5 @ apply the filter |
| 2365 + smuad r7, r7, r5 |
| 2366 + smuad r8, r8, r5 |
| 2367 + smuad r9, r9, r5 |
| 2368 + |
| 2369 + subs r4, r4, #1 @ counter-- |
| 2370 + |
| 2371 + add r6, r6, #0x4 @ round_shift_and_clamp |
| 2372 + add r7, r7, #0x4 |
| 2373 + add r8, r8, #0x4 |
| 2374 + add r9, r9, #0x4 |
| 2375 + |
| 2376 + asr r6, #3 |
| 2377 + asr r7, #3 |
| 2378 + pkhbt r6, r6, r8, lsl #13 |
| 2379 + pkhbt r7, r7, r9, lsl #13 |
| 2380 + orr r6, r6, r7, lsl #8 |
| 2381 + str r6, [r0], #4 @ store result |
| 2382 + |
| 2383 + bne 1b |
| 2384 + |
| 2385 + ldr r4, [sp, #36] @ 4-in-parallel loop counter |
| 2386 + subs r12, r12, #1 |
| 2387 + |
| 2388 + add r2, r2, r3 @ move to next input/output lin
es |
| 2389 + add r0, r0, r1 |
| 2390 + sub r2, r2, r4, lsl #2 |
| 2391 + |
| 2392 + bne 1b |
| 2393 + pop {r4 - r11, pc} |
| 2394 +endfunc |
| 2395 diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c |
| 2396 index 269c6e3..74d9581 100644 |
| 2397 --- a/libavcodec/arm/vp8dsp_init_arm.c |
| 2398 +++ b/libavcodec/arm/vp8dsp_init_arm.c |
| 2399 @@ -19,13 +19,17 @@ |
| 2400 #include <stdint.h> |
| 2401 #include "libavcodec/vp8dsp.h" |
| 2402 |
| 2403 -void ff_vp8_luma_dc_wht_neon(DCTELEM block[4][4][16], DCTELEM dc[16]); |
| 2404 -void ff_vp8_luma_dc_wht_dc_neon(DCTELEM block[4][4][16], DCTELEM dc[16]); |
| 2405 +void ff_vp8_luma_dc_wht_dc_armv6(DCTELEM block[4][4][16], DCTELEM dc[16]); |
| 2406 |
| 2407 -void ff_vp8_idct_add_neon(uint8_t *dst, DCTELEM block[16], int stride); |
| 2408 -void ff_vp8_idct_dc_add_neon(uint8_t *dst, DCTELEM block[16], int stride); |
| 2409 -void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, DCTELEM block[4][16], int stride); |
| 2410 -void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, DCTELEM block[4][16], int stride)
; |
| 2411 +#define idct_funcs(opt) \ |
| 2412 +void ff_vp8_luma_dc_wht_ ## opt(DCTELEM block[4][4][16], DCTELEM dc[16]); \ |
| 2413 +void ff_vp8_idct_add_ ## opt(uint8_t *dst, DCTELEM block[16], int stride); \ |
| 2414 +void ff_vp8_idct_dc_add_ ## opt(uint8_t *dst, DCTELEM block[16], int stride); \ |
| 2415 +void ff_vp8_idct_dc_add4y_ ## opt(uint8_t *dst, DCTELEM block[4][16], int strid
e); \ |
| 2416 +void ff_vp8_idct_dc_add4uv_ ## opt(uint8_t *dst, DCTELEM block[4][16], int stri
de) |
| 2417 + |
| 2418 +idct_funcs(neon); |
| 2419 +idct_funcs(armv6); |
| 2420 |
| 2421 void ff_vp8_v_loop_filter16_neon(uint8_t *dst, int stride, |
| 2422 int flim_E, int flim_I, int hev_thresh); |
| 2423 @@ -47,29 +51,106 @@ void ff_vp8_h_loop_filter8uv_inner_neon(uint8_t *dstU, uint
8_t *dstV, |
| 2424 int stride, int flim_E, int flim_I, |
| 2425 int hev_thresh); |
| 2426 |
| 2427 -void ff_vp8_v_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim); |
| 2428 -void ff_vp8_h_loop_filter16_simple_neon(uint8_t *dst, int stride, int flim); |
| 2429 +void ff_vp8_v_loop_filter_inner_armv6(uint8_t *dst, int stride, |
| 2430 + int flim_E, int flim_I, |
| 2431 + int hev_thresh, int count); |
| 2432 +void ff_vp8_h_loop_filter_inner_armv6(uint8_t *dst, int stride, |
| 2433 + int flim_E, int flim_I, |
| 2434 + int hev_thresh, int count); |
| 2435 +void ff_vp8_v_loop_filter_armv6(uint8_t *dst, int stride, |
| 2436 + int flim_E, int flim_I, |
| 2437 + int hev_thresh, int count); |
| 2438 +void ff_vp8_h_loop_filter_armv6(uint8_t *dst, int stride, |
| 2439 + int flim_E, int flim_I, |
| 2440 + int hev_thresh, int count); |
| 2441 |
| 2442 +static void ff_vp8_v_loop_filter16_armv6(uint8_t *dst, int stride, |
| 2443 + int flim_E, int flim_I, int hev_thresh
) |
| 2444 +{ |
| 2445 + ff_vp8_v_loop_filter_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4); |
| 2446 +} |
| 2447 + |
| 2448 +static void ff_vp8_h_loop_filter16_armv6(uint8_t *dst, int stride, |
| 2449 + int flim_E, int flim_I, int hev_thresh
) |
| 2450 +{ |
| 2451 + ff_vp8_h_loop_filter_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4); |
| 2452 +} |
| 2453 |
| 2454 -#define VP8_MC(n) \ |
| 2455 - void ff_put_vp8_##n##_neon(uint8_t *dst, int dststride, \ |
| 2456 - uint8_t *src, int srcstride, \ |
| 2457 - int h, int x, int y) |
| 2458 +static void ff_vp8_v_loop_filter8uv_armv6(uint8_t *dstU, uint8_t *dstV, int str
ide, |
| 2459 + int flim_E, int flim_I, int hev_thres
h) |
| 2460 +{ |
| 2461 + ff_vp8_v_loop_filter_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2); |
| 2462 + ff_vp8_v_loop_filter_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2); |
| 2463 +} |
| 2464 + |
| 2465 +static void ff_vp8_h_loop_filter8uv_armv6(uint8_t *dstU, uint8_t *dstV, int str
ide, |
| 2466 + int flim_E, int flim_I, int hev_thres
h) |
| 2467 +{ |
| 2468 + ff_vp8_h_loop_filter_armv6(dstU, stride, flim_E, flim_I, hev_thresh, 2); |
| 2469 + ff_vp8_h_loop_filter_armv6(dstV, stride, flim_E, flim_I, hev_thresh, 2); |
| 2470 +} |
| 2471 + |
| 2472 +static void ff_vp8_v_loop_filter16_inner_armv6(uint8_t *dst, int stride, |
| 2473 + int flim_E, int flim_I, int hev_
thresh) |
| 2474 +{ |
| 2475 + ff_vp8_v_loop_filter_inner_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4
); |
| 2476 +} |
| 2477 + |
| 2478 +static void ff_vp8_h_loop_filter16_inner_armv6(uint8_t *dst, int stride, |
| 2479 + int flim_E, int flim_I, int hev_
thresh) |
| 2480 +{ |
| 2481 + ff_vp8_h_loop_filter_inner_armv6(dst, stride, flim_E, flim_I, hev_thresh, 4
); |
| 2482 +} |
| 2483 + |
| 2484 +static void ff_vp8_v_loop_filter8uv_inner_armv6(uint8_t *dstU, uint8_t *dstV, |
| 2485 + int stride, int flim_E, int fli
m_I, |
| 2486 + int hev_thresh) |
| 2487 +{ |
| 2488 + ff_vp8_v_loop_filter_inner_armv6(dstU, stride, flim_E, flim_I, hev_thresh,
2); |
| 2489 + ff_vp8_v_loop_filter_inner_armv6(dstV, stride, flim_E, flim_I, hev_thresh,
2); |
| 2490 +} |
| 2491 + |
| 2492 +static void ff_vp8_h_loop_filter8uv_inner_armv6(uint8_t *dstU, uint8_t *dstV, |
| 2493 + int stride, int flim_E, int fli
m_I, |
| 2494 + int hev_thresh) |
| 2495 +{ |
| 2496 + ff_vp8_h_loop_filter_inner_armv6(dstU, stride, flim_E, flim_I, hev_thresh,
2); |
| 2497 + ff_vp8_h_loop_filter_inner_armv6(dstV, stride, flim_E, flim_I, hev_thresh,
2); |
| 2498 +} |
| 2499 + |
| 2500 +#define simple_lf_funcs(opt) \ |
| 2501 +void ff_vp8_v_loop_filter16_simple_ ## opt(uint8_t *dst, int stride, int flim);
\ |
| 2502 +void ff_vp8_h_loop_filter16_simple_ ## opt(uint8_t *dst, int stride, int flim) |
| 2503 + |
| 2504 +simple_lf_funcs(neon); |
| 2505 +simple_lf_funcs(armv6); |
| 2506 + |
| 2507 +#define VP8_MC_OPT(n, opt) \ |
| 2508 + void ff_put_vp8_##n##_##opt(uint8_t *dst, int dststride, \ |
| 2509 + uint8_t *src, int srcstride, \ |
| 2510 + int h, int x, int y) |
| 2511 + |
| 2512 +#define VP8_MC(n) \ |
| 2513 + VP8_MC_OPT(n, neon) |
| 2514 |
| 2515 #define VP8_EPEL(w) \ |
| 2516 - VP8_MC(pixels ## w); \ |
| 2517 VP8_MC(epel ## w ## _h4); \ |
| 2518 VP8_MC(epel ## w ## _h6); \ |
| 2519 - VP8_MC(epel ## w ## _v4); \ |
| 2520 VP8_MC(epel ## w ## _h4v4); \ |
| 2521 VP8_MC(epel ## w ## _h6v4); \ |
| 2522 + VP8_MC(epel ## w ## _v4); \ |
| 2523 VP8_MC(epel ## w ## _v6); \ |
| 2524 VP8_MC(epel ## w ## _h4v6); \ |
| 2525 VP8_MC(epel ## w ## _h6v6) |
| 2526 |
| 2527 VP8_EPEL(16); |
| 2528 +VP8_MC(pixels16); |
| 2529 +VP8_MC_OPT(pixels16, armv6); |
| 2530 VP8_EPEL(8); |
| 2531 +VP8_MC(pixels8); |
| 2532 +VP8_MC_OPT(pixels8, armv6); |
| 2533 VP8_EPEL(4); |
| 2534 +VP8_MC_OPT(pixels4, armv6); |
| 2535 |
| 2536 VP8_MC(bilin16_h); |
| 2537 VP8_MC(bilin16_v); |
| 2538 @@ -81,83 +162,148 @@ VP8_MC(bilin4_h); |
| 2539 VP8_MC(bilin4_v); |
| 2540 VP8_MC(bilin4_hv); |
| 2541 |
| 2542 +#define VP8_V6_MC(n) \ |
| 2543 +void ff_put_vp8_##n##_armv6(uint8_t *dst, int dststride, uint8_t *src, \ |
| 2544 + int srcstride, int w, int h, int mxy) |
| 2545 + |
| 2546 +VP8_V6_MC(epel_v6); |
| 2547 +VP8_V6_MC(epel_h6); |
| 2548 +VP8_V6_MC(epel_v4); |
| 2549 +VP8_V6_MC(epel_h4); |
| 2550 +VP8_V6_MC(bilin_v); |
| 2551 +VP8_V6_MC(bilin_h); |
| 2552 + |
| 2553 +#define VP8_EPEL_HV(SIZE, TAPNUMX, TAPNUMY, NAME, HNAME, VNAME, MAXHEIGHT) \ |
| 2554 +static void ff_put_vp8_##NAME##SIZE##_##HNAME##VNAME##_armv6( \ |
| 2555 + uint8_t *dst, int dststride, uint8_t *s
rc, \ |
| 2556 + int srcstride, int h, int mx, int my) \ |
| 2557 +{ \ |
| 2558 + DECLARE_ALIGNED(4, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \ |
| 2559 + uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ |
| 2560 + src -= srcstride * (TAPNUMY / 2 - 1); \ |
| 2561 + ff_put_vp8_ ## NAME ## _ ## HNAME ## _armv6(tmp, SIZE, src, srcstri
de, \ |
| 2562 + SIZE, h + TAPNUMY - 1, mx); \ |
| 2563 + ff_put_vp8_ ## NAME ## _ ## VNAME ## _armv6(dst, dststride, tmpptr, SIZE, \ |
| 2564 + SIZE, h, my); \ |
| 2565 +} |
| 2566 + |
| 2567 +VP8_EPEL_HV(16, 6, 6, epel, h6, v6, 16); |
| 2568 +VP8_EPEL_HV(16, 2, 2, bilin, h, v, 16); |
| 2569 +VP8_EPEL_HV(8, 6, 6, epel, h6, v6, 16); |
| 2570 +VP8_EPEL_HV(8, 4, 6, epel, h4, v6, 16); |
| 2571 +VP8_EPEL_HV(8, 6, 4, epel, h6, v4, 16); |
| 2572 +VP8_EPEL_HV(8, 4, 4, epel, h4, v4, 16); |
| 2573 +VP8_EPEL_HV(8, 2, 2, bilin, h, v, 16); |
| 2574 +VP8_EPEL_HV(4, 6, 6, epel, h6, v6, 8); |
| 2575 +VP8_EPEL_HV(4, 4, 6, epel, h4, v6, 8); |
| 2576 +VP8_EPEL_HV(4, 6, 4, epel, h6, v4, 8); |
| 2577 +VP8_EPEL_HV(4, 4, 4, epel, h4, v4, 8); |
| 2578 +VP8_EPEL_HV(4, 2, 2, bilin, h, v, 8); |
| 2579 + |
| 2580 +extern void put_vp8_epel4_v6_c(uint8_t *dst, int d, uint8_t *src, int s, int h, int mx, int my); |
| 2581 +#undef printf |
| 2582 +#define VP8_EPEL_H_OR_V(SIZE, NAME, HV) \ |
| 2583 +static void ff_put_vp8_##NAME##SIZE##_##HV##_armv6( \ |
| 2584 + uint8_t *dst, int dststride, uint8_t *src, \ |
| 2585 + int srcstride, int h, int mx, int my) \ |
| 2586 +{ \ |
| 2587 + ff_put_vp8_## NAME ## _ ## HV ## _armv6(dst, dststride, src, srcstride, \ |
| 2588 + SIZE, h, mx | my); \ |
| 2589 +} |
| 2590 + |
| 2591 +VP8_EPEL_H_OR_V(4, epel, h6); |
| 2592 +VP8_EPEL_H_OR_V(4, epel, h4); |
| 2593 +VP8_EPEL_H_OR_V(4, epel, v6); |
| 2594 +VP8_EPEL_H_OR_V(4, epel, v4); |
| 2595 +VP8_EPEL_H_OR_V(4, bilin, v); |
| 2596 +VP8_EPEL_H_OR_V(4, bilin, h); |
| 2597 +VP8_EPEL_H_OR_V(8, epel, h6); |
| 2598 +VP8_EPEL_H_OR_V(8, epel, h4); |
| 2599 +VP8_EPEL_H_OR_V(8, epel, v6); |
| 2600 +VP8_EPEL_H_OR_V(8, epel, v4); |
| 2601 +VP8_EPEL_H_OR_V(8, bilin, v); |
| 2602 +VP8_EPEL_H_OR_V(8, bilin, h); |
| 2603 +VP8_EPEL_H_OR_V(16, epel, h6); |
| 2604 +VP8_EPEL_H_OR_V(16, epel, v6); |
| 2605 +VP8_EPEL_H_OR_V(16, bilin, v); |
| 2606 +VP8_EPEL_H_OR_V(16, bilin, h); |
| 2607 + |
| 2608 av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp) |
| 2609 { |
| 2610 +#define set_func_ptrs(opt) \ |
| 2611 + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_##opt; \ |
| 2612 + dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; \ |
| 2613 + \ |
| 2614 + dsp->vp8_idct_add = ff_vp8_idct_add_##opt; \ |
| 2615 + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_##opt; \ |
| 2616 + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_##opt; \ |
| 2617 + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_##opt; \ |
| 2618 + \ |
| 2619 + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_##opt; \ |
| 2620 + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_##opt; \ |
| 2621 + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_##opt; \ |
| 2622 + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_##opt; \ |
| 2623 + \ |
| 2624 + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_##opt; \ |
| 2625 + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_##opt; \ |
| 2626 + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_##opt; \ |
| 2627 + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_##opt; \ |
| 2628 + \ |
| 2629 + dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_##opt; \ |
| 2630 + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_##opt; \ |
| 2631 + \ |
| 2632 + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_##opt; \ |
| 2633 + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_##opt; \ |
| 2634 + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_##opt; \ |
| 2635 + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_##opt; \ |
| 2636 + \ |
| 2637 + dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_##opt; \ |
| 2638 + dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_##opt; \ |
| 2639 + dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_##opt; \ |
| 2640 + dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_##opt; \ |
| 2641 + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_##opt; \ |
| 2642 + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_##opt; \ |
| 2643 + dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_##opt; \ |
| 2644 + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_##opt; \ |
| 2645 + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_##opt; \ |
| 2646 + \ |
| 2647 + dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; \ |
| 2648 + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_##opt; \ |
| 2649 + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_##opt; \ |
| 2650 + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_##opt; \ |
| 2651 + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_##opt; \ |
| 2652 + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_##opt; \ |
| 2653 + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_##opt; \ |
| 2654 + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_##opt; \ |
| 2655 + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_##opt; \ |
| 2656 + \ |
| 2657 + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_##opt; \ |
| 2658 + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_##opt; \ |
| 2659 + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_##opt; \ |
| 2660 + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_##opt; \ |
| 2661 + \ |
| 2662 + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_##opt; \ |
| 2663 + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_##opt; \ |
| 2664 + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_##opt; \ |
| 2665 + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_##opt; \ |
| 2666 + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_##opt; \ |
| 2667 + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_##opt; \ |
| 2668 + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_##opt; \ |
| 2669 + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_##opt; \ |
| 2670 + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_##opt; \ |
| 2671 + \ |
| 2672 + dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; \ |
| 2673 + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_##opt; \ |
| 2674 + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_##opt; \ |
| 2675 + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_##opt; \ |
| 2676 + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_##opt; \ |
| 2677 + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_##opt; \ |
| 2678 + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_##opt; \ |
| 2679 + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_##opt; \ |
| 2680 + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_##opt |
| 2681 if (HAVE_NEON) { |
| 2682 - dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon; |
| 2683 - dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_neon; |
| 2684 - |
| 2685 - dsp->vp8_idct_add = ff_vp8_idct_add_neon; |
| 2686 - dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; |
| 2687 - dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; |
| 2688 - dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon; |
| 2689 - |
| 2690 - dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; |
| 2691 - dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; |
| 2692 - dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; |
| 2693 - dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; |
| 2694 - |
| 2695 - dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; |
| 2696 - dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon; |
| 2697 - dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon; |
| 2698 - dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon; |
| 2699 - |
| 2700 - dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon; |
| 2701 - dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon; |
| 2702 - |
| 2703 - dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; |
| 2704 - dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon; |
| 2705 - dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon; |
| 2706 - dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; |
| 2707 - |
| 2708 - dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; |
| 2709 - dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon; |
| 2710 - dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon; |
| 2711 - dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon; |
| 2712 - dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; |
| 2713 - dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; |
| 2714 - dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; |
| 2715 - dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; |
| 2716 - dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; |
| 2717 - |
| 2718 - dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon; |
| 2719 - dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; |
| 2720 - dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; |
| 2721 - dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; |
| 2722 - dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; |
| 2723 - dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; |
| 2724 - dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; |
| 2725 - dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; |
| 2726 - dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; |
| 2727 - |
| 2728 - dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; |
| 2729 - dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; |
| 2730 - dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; |
| 2731 - dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; |
| 2732 - dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; |
| 2733 - dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; |
| 2734 - dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; |
| 2735 - dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; |
| 2736 - dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; |
| 2737 - |
| 2738 - dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; |
| 2739 - dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; |
| 2740 - dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon; |
| 2741 - dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; |
| 2742 - dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; |
| 2743 - dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; |
| 2744 - dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; |
| 2745 - dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; |
| 2746 - dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; |
| 2747 - |
| 2748 - dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_neon; |
| 2749 - dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; |
| 2750 - dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; |
| 2751 - dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; |
| 2752 - dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; |
| 2753 - dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; |
| 2754 - dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; |
| 2755 - dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; |
| 2756 - dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; |
| 2757 + set_func_ptrs(neon); |
| 2758 + } else if (HAVE_ARMV6) { |
| 2759 + set_func_ptrs(armv6); |
| 2760 } |
| 2761 } |
| 2762 diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S |
| 2763 index 1b9f24e..8e79982 100644 |
| 2764 --- a/libavcodec/arm/vp8dsp_neon.S |
| 2765 +++ b/libavcodec/arm/vp8dsp_neon.S |
| 2766 @@ -76,18 +76,6 @@ function ff_vp8_luma_dc_wht_neon, export=1 |
| 2767 bx lr |
| 2768 endfunc |
| 2769 |
| 2770 -function ff_vp8_luma_dc_wht_dc_neon, export=1 |
| 2771 - ldrsh r2, [r1] |
| 2772 - mov r3, #0 |
| 2773 - add r2, r2, #3 |
| 2774 - strh r3, [r1] |
| 2775 - asr r2, r2, #3 |
| 2776 - .rept 16 |
| 2777 - strh r2, [r0], #32 |
| 2778 - .endr |
| 2779 - bx lr |
| 2780 -endfunc |
| 2781 - |
| 2782 function ff_vp8_idct_add_neon, export=1 |
| 2783 vld1.16 {q0-q1}, [r1,:128] |
| 2784 movw r3, #20091 |
| 2785 @@ -741,23 +729,6 @@ function ff_put_vp8_pixels8_neon, export=1 |
| 2786 bx lr |
| 2787 endfunc |
| 2788 |
| 2789 -function ff_put_vp8_pixels4_neon, export=1 |
| 2790 - ldr r12, [sp, #0] @ h |
| 2791 - push {r4-r6,lr} |
| 2792 -1: |
| 2793 - subs r12, r12, #4 |
| 2794 - ldr_post r4, r2, r3 |
| 2795 - ldr_post r5, r2, r3 |
| 2796 - ldr_post r6, r2, r3 |
| 2797 - ldr_post lr, r2, r3 |
| 2798 - str_post r4, r0, r1 |
| 2799 - str_post r5, r0, r1 |
| 2800 - str_post r6, r0, r1 |
| 2801 - str_post lr, r0, r1 |
| 2802 - bgt 1b |
| 2803 - pop {r4-r6,pc} |
| 2804 -endfunc |
| 2805 - |
| 2806 /* 4/6-tap 8th-pel MC */ |
| 2807 |
| 2808 .macro vp8_epel8_h6 d, a, b |
| 2809 -- |
| 2810 1.7.5.4 |
| 2811 |
| OLD | NEW |