| OLD | NEW |
| 1 | 1 |
| 2 /* | 2 /* |
| 3 * Copyright 2012 The Android Open Source Project | 3 * Copyright 2012 The Android Open Source Project |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license that can be | 5 * Use of this source code is governed by a BSD-style license that can be |
| 6 * found in the LICENSE file. | 6 * found in the LICENSE file. |
| 7 */ | 7 */ |
| 8 | 8 |
| 9 | 9 |
| | 10     #include <arm_neon.h> |
| 10 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
| 11 | 12 |
| 12 /* | 13 /* |
| 13 Filter_32_opaque | 14 * Filter_32_opaque |
| 14 | 15 * |
| 15 There is no hard-n-fast rule that the filtering must produce | 16 * There is no hard-n-fast rule that the filtering must produce |
| 16 exact results for the color components, but if the 4 incoming colors are | 17 * exact results for the color components, but if the 4 incoming colors are |
| 17 all opaque, then the output color must also be opaque. Subsequent parts of | 18 * all opaque, then the output color must also be opaque. Subsequent parts of |
| 18 the drawing pipeline may rely on this (e.g. which blitrow proc to use). | 19 * the drawing pipeline may rely on this (e.g. which blitrow proc to use). |
| 19 */ | 20 */ |
| 20 | 21 |
| 21 static inline void Filter_32_opaque_neon(unsigned x, unsigned y, | 22 static inline void Filter_32_opaque_neon(unsigned x, unsigned y, |
| 22 SkPMColor a00, SkPMColor a01, | 23 SkPMColor a00, SkPMColor a01, |
| 23 SkPMColor a10, SkPMColor a11, | 24 SkPMColor a10, SkPMColor a11, |
| 24 SkPMColor *dst) { | 25 SkPMColor *dst) { |
| 25 asm volatile( | 26 uint8x8_t vy, vconst16_8, v16_y, vres; |
| 26     "vdup.8         d0, %[y]                \n\t"   // duplicate y into d0 | 27     uint16x4_t vx, vconst16_16, v16_x, tmp; |
| 27     "vmov.u8        d16, #16                \n\t"   // set up constant in d16 | 28     uint32x2_t va0, va1; |
| 28     "vsub.u8        d1, d16, d0             \n\t"   // d1 = 16-y | 29     uint16x8_t tmp1, tmp2; |
| 29 | 30 |
| 30     "vdup.32        d4, %[a00]              \n\t"   // duplicate a00 into d4 | 31     vy = vdup_n_u8(y);                // duplicate y into vy |
| 31     "vdup.32        d5, %[a10]              \n\t"   // duplicate a10 into d5 | 32     vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8 |
| 32     "vmov.32        d4[1], %[a01]           \n\t"   // set top of d4 to a01 | 33     v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y |
| 33     "vmov.32        d5[1], %[a11]           \n\t"   // set top of d5 to a11 | |
| 34 | 34 |
| 35     "vmull.u8       q3, d4, d1              \n\t"   // q3 = [a01|a00] * (16-y) | 35     va0 = vdup_n_u32(a00);            // duplicate a00 |
| 36     "vmull.u8       q0, d5, d0              \n\t"   // q0 = [a11|a10] * y | 36     va1 = vdup_n_u32(a10);            // duplicate a10 |
| | 37     va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
| | 38     va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
| 37 | 39 |
| 38     "vdup.16        d5, %[x]                \n\t"   // duplicate x into d5 | 40     tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
| 39     "vmov.u16       d16, #16                \n\t"   // set up constant in d16 | 41     tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y |
| 40     "vsub.u16       d3, d16, d5             \n\t"   // d3 = 16-x | |
| 41 | 42 |
| 42     "vmul.i16       d4, d7, d5              \n\t"   // d4  = a01 * x | 43     vx = vdup_n_u16(x);               // duplicate x into vx |
| 43     "vmla.i16       d4, d1, d5              \n\t"   // d4 += a11 * x | 44     vconst16_16 = vmov_n_u16(16);     // set up constant in vconst16_16 |
| 44     "vmla.i16       d4, d6, d3              \n\t"   // d4 += a00 * (16-x) | 45     v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
| 45     "vmla.i16       d4, d0, d3              \n\t"   // d4 += a10 * (16-x) | 46 |
| 46     "vshrn.i16      d0, q2, #8              \n\t"   // shift down result by 8 | 47     tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x |
| 47     "vst1.32        {d0[0]}, [%[dst]]       \n\t"   // store result | 48     tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x |
| 48     : | 49     tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) |
| 49     : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst) | 50     tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) |
| 50     : "cc", "memory", "d0", "d1", "d3", "d4", "d5", "d6", "d7", "d16" | 51 |
| 51     ); | 52     vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 |
| | 53     vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
| 52 } | 54 } |
| 53 | 55 |
| 54 static inline void Filter_32_alpha_neon(unsigned x, unsigned y, | 56 static inline void Filter_32_alpha_neon(unsigned x, unsigned y, |
| 55 SkPMColor a00, SkPMColor a01, | 57 SkPMColor a00, SkPMColor a01, |
| 56 SkPMColor a10, SkPMColor a11, | 58 SkPMColor a10, SkPMColor a11, |
| 57 SkPMColor *dst, uint16_t scale) { | 59 SkPMColor *dst, uint16_t scale) { |
| 58 asm volatile( | 60 uint8x8_t vy, vconst16_8, v16_y, vres; |
| 59     "vdup.8         d0, %[y]                \n\t"   // duplicate y into d0 | 61     uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
| 60     "vmov.u8        d16, #16                \n\t"   // set up constant in d16 | 62     uint32x2_t va0, va1; |
| 61     "vsub.u8        d1, d16, d0             \n\t"   // d1 = 16-y | 63     uint16x8_t tmp1, tmp2; |
| 62 | 64 |
| 63     "vdup.32        d4, %[a00]              \n\t"   // duplicate a00 into d4 | 65     vy = vdup_n_u8(y);                // duplicate y into vy |
| 64     "vdup.32        d5, %[a10]              \n\t"   // duplicate a10 into d5 | 66     vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8 |
| 65     "vmov.32        d4[1], %[a01]           \n\t"   // set top of d4 to a01 | 67     v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y |
| 66     "vmov.32        d5[1], %[a11]           \n\t"   // set top of d5 to a11 | |
| 67 | 68 |
| 68     "vmull.u8       q3, d4, d1              \n\t"   // q3 = [a01|a00] * (16-y) | 69     va0 = vdup_n_u32(a00);            // duplicate a00 |
| 69     "vmull.u8       q0, d5, d0              \n\t"   // q0 = [a11|a10] * y | 70     va1 = vdup_n_u32(a10);            // duplicate a10 |
| | 71     va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
| | 72     va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
| 70 | 73 |
| 71     "vdup.16        d5, %[x]                \n\t"   // duplicate x into d5 | 74     tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
| 72     "vmov.u16       d16, #16                \n\t"   // set up constant in d16 | 75     tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y |
| 73     "vsub.u16       d3, d16, d5             \n\t"   // d3 = 16-x | |
| 74 | 76 |
| 75     "vmul.i16       d4, d7, d5              \n\t"   // d4  = a01 * x | 77     vx = vdup_n_u16(x);               // duplicate x into vx |
| 76     "vmla.i16       d4, d1, d5              \n\t"   // d4 += a11 * x | 78     vconst16_16 = vmov_n_u16(16);     // set up constant in vconst16_16 |
| 77     "vmla.i16       d4, d6, d3              \n\t"   // d4 += a00 * (16-x) | 79     v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
| 78     "vmla.i16       d4, d0, d3              \n\t"   // d4 += a10 * (16-x) | 80 |
| 79     "vdup.16        d3, %[scale]            \n\t"   // duplicate scale into d3 | 81     tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x |
| 80     "vshr.u16       d4, d4, #8              \n\t"   // shift down result by 8 | 82     tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x |
| 81     "vmul.i16       d4, d4, d3              \n\t"   // multiply result by scale | 83     tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) |
| 82     "vshrn.i16      d0, q2, #8              \n\t"   // shift down result by 8 | 84     tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) |
| 83     "vst1.32        {d0[0]}, [%[dst]]       \n\t"   // store result | 85 |
| 84     : | 86     vscale = vdup_n_u16(scale);       // duplicate scale |
| 85     : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale) | 87     tmp = vshr_n_u16(tmp, 8);         // shift down result by 8 |
| 86     : "cc", "memory", "d0", "d1", "d3", "d4", "d5", "d6", "d7", "d16" | 88     tmp = vmul_u16(tmp, vscale);      // multiply result by scale |
| 87     ); | 89 |
| | 90     vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 |
| | 91     vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
| 88 } | 92 } |
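Both columns implement the same fixed-point bilinear filter: x and y are 4-bit fractional weights in [0, 16), and the four weights (16-x)(16-y), x(16-y), (16-x)y, and xy always sum to 256, which is why four fully opaque inputs stay opaque after the final shift by 8. The sketch below is a plain scalar reference (an illustrative helper name, not part of Skia) showing the arithmetic that both the assembly and the intrinsics versions vectorize.

```cpp
#include <stdint.h>

typedef uint32_t SkPMColor;  // premultiplied 8888 color, as in SkColorPriv.h

// Illustrative scalar equivalent of Filter_32_opaque_neon above:
// filters each 8-bit channel of the four source pixels independently.
static inline void filter_32_opaque_ref(unsigned x, unsigned y,
                                        SkPMColor a00, SkPMColor a01,
                                        SkPMColor a10, SkPMColor a11,
                                        SkPMColor* dst) {
    uint32_t result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t c00 = (a00 >> shift) & 0xFF;
        uint32_t c01 = (a01 >> shift) & 0xFF;
        uint32_t c10 = (a10 >> shift) & 0xFF;
        uint32_t c11 = (a11 >> shift) & 0xFF;
        // The four weights sum to 256, so four 0xFF inputs give
        // 0xFF * 256 >> 8 == 0xFF, preserving opacity.
        uint32_t c = c00 * (16 - x) * (16 - y)
                   + c01 * x        * (16 - y)
                   + c10 * (16 - x) * y
                   + c11 * x        * y;
        result |= (c >> 8) << shift;
    }
    *dst = result;
}
```

The `_alpha` variant differs only at the end: each per-channel result is shifted down by 8, multiplied by `scale`, and shifted down by 8 again before packing, which corresponds to the extra `vshr.u16`/`vmul.i16` pair in the assembly and the `vshr_n_u16`/`vmul_u16` pair in the intrinsics.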