src/opts/SkBitmapProcState_filter_neon.h - Issue 21915004: ARM Skia NEON patches - 19 - Intrinsics version of the Filter32 routines

Side by Side Diff: src/opts/SkBitmapProcState_filter_neon.h

Issue 21915004: ARM Skia NEON patches - 19 - Intrinsics version of the Filter32 routines (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1	1

2 /*	2 /*

3 * Copyright 2012 The Android Open Source Project	3 * Copyright 2012 The Android Open Source Project

4 *	4 *

5 * Use of this source code is governed by a BSD-style license that can be	5 * Use of this source code is governed by a BSD-style license that can be

6 * found in the LICENSE file.	6 * found in the LICENSE file.

7 */	7 */

8	8

9	9

	10 #include <arm_neon.h>

10 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

11	12

12 /*	13 /*

13 Filter_32_opaque	14 * Filter_32_opaque

14	15 *

15 There is no hard-n-fast rule that the filtering must produce	16 * There is no hard-n-fast rule that the filtering must produce

16 exact results for the color components, but if the 4 incoming colors are	17 * exact results for the color components, but if the 4 incoming colors are

17 all opaque, then the output color must also be opaque. Subsequent parts of	18 * all opaque, then the output color must also be opaque. Subsequent parts of

18 the drawing pipeline may rely on this (e.g. which blitrow proc to use).	19 * the drawing pipeline may rely on this (e.g. which blitrow proc to use).

19 */	20 */

20	21

21 static inline void Filter_32_opaque_neon(unsigned x, unsigned y,	22 static inline void Filter_32_opaque_neon(unsigned x, unsigned y,

22 SkPMColor a00, SkPMColor a01,	23 SkPMColor a00, SkPMColor a01,

23 SkPMColor a10, SkPMColor a11,	24 SkPMColor a10, SkPMColor a11,

24 SkPMColor *dst) {	25 SkPMColor *dst) {

25 asm volatile(	26 uint8x8_t vy, vconst16_8, v16_y, vres;

26 "vdup.8 d0, %[y] \n\t" // duplicate y into d0	27 uint16x4_t vx, vconst16_16, v16_x, tmp;

27 "vmov.u8 d16, #16 \n\t" // set up const ant in d16	28 uint32x2_t va0, va1;

28 "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y	29 uint16x8_t tmp1, tmp2;

29	30

30 "vdup.32 d4, %[a00] \n\t" // duplicate a0 0 into d4	31 vy = vdup_n_u8(y); // duplicate y into vy

31 "vdup.32 d5, %[a10] \n\t" // duplicate a1 0 into d5	32 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8

32 "vmov.32 d4[1], %[a01] \n\t" // set top of d 4 to a01	33 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y

33 "vmov.32 d5[1], %[a11] \n\t" // set top of d 5 to a11

34	34

35 "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01\|a0 0] * (16-y)	35 va0 = vdup_n_u32(a00); // duplicate a00

36 "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11\|a1 0] * y	36 va1 = vdup_n_u32(a10); // duplicate a10

	37 va0 = vset_lane_u32(a01, va0, 1); // set top to a01

	38 va1 = vset_lane_u32(a11, va1, 1); // set top to a11

37	39

38 "vdup.16 d5, %[x] \n\t" // duplicate x into d5	40 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01\|a00] * (16- y)

39 "vmov.u16 d16, #16 \n\t" // set up const ant in d16	41 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11\|a10] * y

40 "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x

41	42

42 "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x	43 vx = vdup_n_u16(x); // duplicate x into vx

43 "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x	44 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16

44 "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)	45 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

45 "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)	46

46 "vshrn.i16 d0, q2, #8 \n\t" // shift down r esult by 8	47 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x

47 "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result	48 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x

48 :	49 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)

49 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [ a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst)	50 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

50 : "cc", "memory", "d0", "d1", "d3", "d4", "d5", "d6", "d7", "d1 6"	51

51 );	52 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down resu lt by 8

	53 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result

52 }	54 }

53	55

54 static inline void Filter_32_alpha_neon(unsigned x, unsigned y,	56 static inline void Filter_32_alpha_neon(unsigned x, unsigned y,

55 SkPMColor a00, SkPMColor a01,	57 SkPMColor a00, SkPMColor a01,

56 SkPMColor a10, SkPMColor a11,	58 SkPMColor a10, SkPMColor a11,

57 SkPMColor *dst, uint16_t scale) {	59 SkPMColor *dst, uint16_t scale) {

58 asm volatile(	60 uint8x8_t vy, vconst16_8, v16_y, vres;

59 "vdup.8 d0, %[y] \n\t" // duplicate y into d0	61 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;

60 "vmov.u8 d16, #16 \n\t" // set up const ant in d16	62 uint32x2_t va0, va1;

61 "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y	63 uint16x8_t tmp1, tmp2;

62	64

63 "vdup.32 d4, %[a00] \n\t" // duplicate a0 0 into d4	65 vy = vdup_n_u8(y); // duplicate y into vy

64 "vdup.32 d5, %[a10] \n\t" // duplicate a1 0 into d5	66 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8

65 "vmov.32 d4[1], %[a01] \n\t" // set top of d 4 to a01	67 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y

66 "vmov.32 d5[1], %[a11] \n\t" // set top of d 5 to a11

67	68

68 "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01\|a0 0] * (16-y)	69 va0 = vdup_n_u32(a00); // duplicate a00

69 "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11\|a1 0] * y	70 va1 = vdup_n_u32(a10); // duplicate a10

	71 va0 = vset_lane_u32(a01, va0, 1); // set top to a01

	72 va1 = vset_lane_u32(a11, va1, 1); // set top to a11

70	73

71 "vdup.16 d5, %[x] \n\t" // duplicate x into d5	74 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01\|a00] * (16- y)

72 "vmov.u16 d16, #16 \n\t" // set up const ant in d16	75 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11\|a10] * y

73 "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x

74	76

75 "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x	77 vx = vdup_n_u16(x); // duplicate x into vx

76 "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x	78 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16

77 "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)	79 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

78 "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)	80

79 "vdup.16 d3, %[scale] \n\t" // duplicate sc ale into d3	81 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x

80 "vshr.u16 d4, d4, #8 \n\t" // shift down r esult by 8	82 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x

81 "vmul.i16 d4, d4, d3 \n\t" // multiply res ult by scale	83 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)

82 "vshrn.i16 d0, q2, #8 \n\t" // shift down r esult by 8	84 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

83 "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result	85

84 :	86 vscale = vdup_n_u16(scale); // duplicate scale

85 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [ a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale)	87 tmp = vshr_n_u16(tmp, 8); // shift down result by 8

86 : "cc", "memory", "d0", "d1", "d3", "d4", "d5", "d6", "d7", "d1 6"	88 tmp = vmul_u16(tmp, vscale); // multiply result by scale

87 );	89

	90 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down resu lt by 8

	91 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result

88 }	92 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »