OLD | NEW |
1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola | 1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola |
2 * | 2 * |
3 * Use of this source code is governed by a BSD-style license that can be | 3 * Use of this source code is governed by a BSD-style license that can be |
4 * found in the LICENSE file. | 4 * found in the LICENSE file. |
5 */ | 5 */ |
6 | 6 |
7 #include "SkBitmapProcState.h" | 7 #include "SkBitmapProcState.h" |
8 #include "SkPerspIter.h" | 8 #include "SkPerspIter.h" |
9 #include "SkShader.h" | 9 #include "SkShader.h" |
10 #include "SkUtilsArm.h" | 10 #include "SkUtilsArm.h" |
| 11 #include "SkBitmapProcState_utils.h" |
11 | 12 |
12 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; | 13 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; |
13 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; | 14 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; |
14 | 15 |
15 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); | 16 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); |
16 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); | 17 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); |
17 | 18 |
18 static unsigned SK_USHIFT16(unsigned x) { | |
19 return x >> 16; | |
20 } | |
21 | |
22 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon | 19 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon |
23 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) | 20 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) |
24 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) | 21 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) |
25 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) | 22 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) |
26 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) | 23 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) |
27 #define CHECK_FOR_DECAL | 24 #define CHECK_FOR_DECAL |
28 #include "SkBitmapProcState_matrix_clamp_neon.h" | 25 #include "SkBitmapProcState_matrix_clamp_neon.h" |
29 | 26 |
30 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon | 27 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon |
31 #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) | 28 #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) |
32 #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) | 29 #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) |
33 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) | 30 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) |
34 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) | 31 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) |
35 #include "SkBitmapProcState_matrix_repeat_neon.h" | 32 #include "SkBitmapProcState_matrix_repeat_neon.h" |
36 | 33 |
37 | 34 |
38 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) | |
39 { | |
40 int i; | |
41 | 35 |
| 36 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { |
42 if (count >= 8) { | 37 if (count >= 8) { |
43 /* SkFixed is 16.16 fixed point */ | 38 // SkFixed is 16.16 fixed point |
44 SkFixed dx2 = dx+dx; | 39 SkFixed dx8 = dx * 8; |
45 SkFixed dx4 = dx2+dx2; | 40 int32x4_t vdx8 = vdupq_n_s32(dx8); |
46 SkFixed dx8 = dx4+dx4; | |
47 | 41 |
48 /* now build fx/fx+dx/fx+2dx/fx+3dx */ | 42 // setup lbase and hbase |
49 SkFixed fx1, fx2, fx3; | |
50 int32x4_t lbase, hbase; | 43 int32x4_t lbase, hbase; |
51 uint16_t *dst16 = (uint16_t *)dst; | 44 lbase = vdupq_n_s32(fx); |
| 45 lbase = vsetq_lane_s32(fx + dx, lbase, 1); |
| 46 lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2); |
| 47 lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3); |
| 48 hbase = lbase + vdupq_n_s32(4 * dx); |
52 | 49 |
53 fx1 = fx+dx; | 50 do { |
54 fx2 = fx1+dx; | 51 // store the upper 16 bits |
55 fx3 = fx2+dx; | 52 vst1q_u32(dst, vreinterpretq_u32_s16( |
| 53 vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1] |
| 54 )); |
56 | 55 |
57 /* avoid an 'lbase uninitialized' warning */ | 56 // on to the next group of 8 |
58 lbase = vdupq_n_s32(fx); | 57 lbase += vdx8; |
59 lbase = vsetq_lane_s32(fx1, lbase, 1); | 58 hbase += vdx8; |
60 lbase = vsetq_lane_s32(fx2, lbase, 2); | 59 dst += 4; // we did 8 elements but the result is twice smaller |
61 lbase = vsetq_lane_s32(fx3, lbase, 3); | |
62 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); | |
63 | |
64 /* take upper 16 of each, store, and bump everything */ | |
65 do { | |
66 int32x4_t lout, hout; | |
67 uint16x8_t hi16; | |
68 | |
69 lout = lbase; | |
70 hout = hbase; | |
71 /* gets hi's of all louts then hi's of all houts */ | |
72 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); | |
73 hi16 = vreinterpretq_u16_s32(hout); | |
74 vst1q_u16(dst16, hi16); | |
75 | |
76 /* on to the next */ | |
77 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); | |
78 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); | |
79 dst16 += 8; | |
80 count -= 8; | 60 count -= 8; |
81 fx += dx8; | 61 fx += dx8; |
82 } while (count >= 8); | 62 } while (count >= 8); |
83 dst = (uint32_t *) dst16; | |
84 } | 63 } |
85 | 64 |
86 uint16_t* xx = (uint16_t*)dst; | 65 uint16_t* xx = (uint16_t*)dst; |
87 for (i = count; i > 0; --i) { | 66 for (int i = count; i > 0; --i) { |
88 *xx++ = SkToU16(fx >> 16); fx += dx; | 67 *xx++ = SkToU16(fx >> 16); fx += dx; |
89 } | 68 } |
90 } | 69 } |
91 | 70 |
92 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) | 71 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { |
93 { | |
94 if (count >= 8) { | 72 if (count >= 8) { |
95 int32x4_t wide_fx; | 73 SkFixed dx8 = dx * 8; |
96 int32x4_t wide_fx2; | 74 int32x4_t vdx8 = vdupq_n_s32(dx8); |
97 int32x4_t wide_dx8 = vdupq_n_s32(dx*8); | |
98 | 75 |
| 76 int32x4_t wide_fx, wide_fx2; |
99 wide_fx = vdupq_n_s32(fx); | 77 wide_fx = vdupq_n_s32(fx); |
100 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); | 78 wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1); |
101 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); | 79 wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2); |
102 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); | 80 wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3); |
103 | 81 |
104 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); | 82 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx)); |
105 | 83 |
106 while (count >= 8) { | 84 while (count >= 8) { |
107 int32x4_t wide_out; | 85 int32x4_t wide_out; |
108 int32x4_t wide_out2; | 86 int32x4_t wide_out2; |
109 | 87 |
110 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); | 88 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); |
111 wide_out = vorrq_s32(wide_out, | 89 wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1)); |
112 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); | |
113 | 90 |
114 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); | 91 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); |
115 wide_out2 = vorrq_s32(wide_out2, | 92 wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1)); |
116 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); | |
117 | 93 |
118 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); | 94 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); |
119 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); | 95 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); |
120 | 96 |
121 dst += 8; | 97 dst += 8; |
122 fx += dx*8; | 98 fx += dx8; |
123 wide_fx = vaddq_s32(wide_fx, wide_dx8); | 99 wide_fx += vdx8; |
124 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); | 100 wide_fx2 += vdx8; |
125 count -= 8; | 101 count -= 8; |
126 } | 102 } |
127 } | 103 } |
128 | 104 |
129 if (count & 1) | 105 if (count & 1) |
130 { | 106 { |
131 SkASSERT((fx >> (16 + 14)) == 0); | 107 SkASSERT((fx >> (16 + 14)) == 0); |
132 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); | 108 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); |
133 fx += dx; | 109 fx += dx; |
134 } | 110 } |
135 while ((count -= 2) >= 0) | 111 while ((count -= 2) >= 0) |
136 { | 112 { |
137 SkASSERT((fx >> (16 + 14)) == 0); | 113 SkASSERT((fx >> (16 + 14)) == 0); |
138 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); | 114 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); |
139 fx += dx; | 115 fx += dx; |
140 | 116 |
141 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); | 117 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); |
142 fx += dx; | 118 fx += dx; |
143 } | 119 } |
144 } | 120 } |
OLD | NEW |
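
Note for readers of this diff: a minimal scalar sketch of the values the two decal routines above produce may help when checking the NEON lanes. It assumes SkFixed is 16.16 fixed point (as the diff comments state); the *_ref names are hypothetical reference helpers, not Skia functions. The nofilter path writes the integer part of fx as packed 16-bit x coordinates. The filter path stores, per pixel, the value (fx >> 12 << 14) | ((fx >> 16) + 1), which (given the SkASSERT range) is x in bits 18..31, the 4-bit subpixel weight in bits 14..17, and x+1 in bits 0..13. The NEON loops simply compute eight of these per iteration.

#include <stdint.h>

typedef int32_t SkFixed;  // assumption: Skia's 16.16 fixed-point type

// nofilter: one 16-bit integer x coordinate per pixel (top 16 bits of fx)
static void decal_nofilter_scale_ref(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
    uint16_t* xx = (uint16_t*)dst;
    for (int i = 0; i < count; ++i) {
        *xx++ = (uint16_t)(fx >> 16);
        fx += dx;
    }
}

// filter: (x << 18) | (subpixel << 14) | (x + 1), matching the scalar tail loop in the diff
static void decal_filter_scale_ref(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
    for (int i = 0; i < count; ++i) {
        *dst++ = ((uint32_t)(fx >> 12) << 14) | (uint32_t)((fx >> 16) + 1);
        fx += dx;
    }
}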