Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(176)

Side by Side Diff: src/opts/SkBitmapProcState_matrixProcs_neon.cpp

Issue 21931002: ARM Skia NEON patches - 18 - Preparation work for BitmapProcState (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/core/SkBitmapProcState_utils.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola 1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola
2 * 2 *
3 * Use of this source code is governed by a BSD-style license that can be 3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file. 4 * found in the LICENSE file.
5 */ 5 */
6 6
7 #include "SkBitmapProcState.h" 7 #include "SkBitmapProcState.h"
8 #include "SkPerspIter.h" 8 #include "SkPerspIter.h"
9 #include "SkShader.h" 9 #include "SkShader.h"
10 #include "SkUtilsArm.h" 10 #include "SkUtilsArm.h"
11 #include "SkBitmapProcState_utils.h"
11 12
12 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; 13 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
13 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; 14 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
14 15
15 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 16 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
16 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 17 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
17 18
18 static unsigned SK_USHIFT16(unsigned x) {
19 return x >> 16;
20 }
21
22 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon 19 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
23 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 20 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
24 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 21 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
25 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 22 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
26 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 23 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
27 #define CHECK_FOR_DECAL 24 #define CHECK_FOR_DECAL
28 #include "SkBitmapProcState_matrix_clamp_neon.h" 25 #include "SkBitmapProcState_matrix_clamp_neon.h"
29 26
30 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon 27 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon
31 #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) 28 #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
32 #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) 29 #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
33 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 30 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
34 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 31 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
35 #include "SkBitmapProcState_matrix_repeat_neon.h" 32 #include "SkBitmapProcState_matrix_repeat_neon.h"
36 33
37 34
38 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
39 {
40 int i;
41 35
36 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
42 if (count >= 8) { 37 if (count >= 8) {
43 /* SkFixed is 16.16 fixed point */ 38 // SkFixed is 16.16 fixed point
44 SkFixed dx2 = dx+dx; 39 SkFixed dx8 = dx * 8;
45 SkFixed dx4 = dx2+dx2; 40 int32x4_t vdx8 = vdupq_n_s32(dx8);
46 SkFixed dx8 = dx4+dx4;
47 41
48 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 42 // setup lbase and hbase
49 SkFixed fx1, fx2, fx3;
50 int32x4_t lbase, hbase; 43 int32x4_t lbase, hbase;
51 uint16_t *dst16 = (uint16_t *)dst; 44 lbase = vdupq_n_s32(fx);
45 lbase = vsetq_lane_s32(fx + dx, lbase, 1);
46 lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
47 lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
48 hbase = lbase + vdupq_n_s32(4 * dx);
52 49
53 fx1 = fx+dx; 50 do {
54 fx2 = fx1+dx; 51 // store the upper 16 bits
55 fx3 = fx2+dx; 52 vst1q_u32(dst, vreinterpretq_u32_s16(
53 vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
54 ));
56 55
57 /* avoid an 'lbase unitialized' warning */ 56 // on to the next group of 8
58 lbase = vdupq_n_s32(fx); 57 lbase += vdx8;
59 lbase = vsetq_lane_s32(fx1, lbase, 1); 58 hbase += vdx8;
60 lbase = vsetq_lane_s32(fx2, lbase, 2); 59 dst += 4; // we did 8 elements but the result is twice smaller
61 lbase = vsetq_lane_s32(fx3, lbase, 3);
62 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
63
64 /* take upper 16 of each, store, and bump everything */
65 do {
66 int32x4_t lout, hout;
67 uint16x8_t hi16;
68
69 lout = lbase;
70 hout = hbase;
71 /* gets hi's of all louts then hi's of all houts */
72 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
73 hi16 = vreinterpretq_u16_s32(hout);
74 vst1q_u16(dst16, hi16);
75
76 /* on to the next */
77 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
78 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
79 dst16 += 8;
80 count -= 8; 60 count -= 8;
81 fx += dx8; 61 fx += dx8;
82 } while (count >= 8); 62 } while (count >= 8);
83 dst = (uint32_t *) dst16;
84 } 63 }
85 64
86 uint16_t* xx = (uint16_t*)dst; 65 uint16_t* xx = (uint16_t*)dst;
87 for (i = count; i > 0; --i) { 66 for (int i = count; i > 0; --i) {
88 *xx++ = SkToU16(fx >> 16); fx += dx; 67 *xx++ = SkToU16(fx >> 16); fx += dx;
89 } 68 }
90 } 69 }
91 70
92 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 71 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
93 {
94 if (count >= 8) { 72 if (count >= 8) {
95 int32x4_t wide_fx; 73 SkFixed dx8 = dx * 8;
96 int32x4_t wide_fx2; 74 int32x4_t vdx8 = vdupq_n_s32(dx8);
97 int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
98 75
76 int32x4_t wide_fx, wide_fx2;
99 wide_fx = vdupq_n_s32(fx); 77 wide_fx = vdupq_n_s32(fx);
100 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 78 wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
101 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 79 wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
102 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 80 wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
103 81
104 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); 82 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
105 83
106 while (count >= 8) { 84 while (count >= 8) {
107 int32x4_t wide_out; 85 int32x4_t wide_out;
108 int32x4_t wide_out2; 86 int32x4_t wide_out2;
109 87
110 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 88 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
111 wide_out = vorrq_s32(wide_out, 89 wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
112 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
113 90
114 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 91 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
115 wide_out2 = vorrq_s32(wide_out2, 92 wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
116 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
117 93
118 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 94 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
119 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 95 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
120 96
121 dst += 8; 97 dst += 8;
122 fx += dx*8; 98 fx += dx8;
123 wide_fx = vaddq_s32(wide_fx, wide_dx8); 99 wide_fx += vdx8;
124 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); 100 wide_fx2 += vdx8;
125 count -= 8; 101 count -= 8;
126 } 102 }
127 } 103 }
128 104
129 if (count & 1) 105 if (count & 1)
130 { 106 {
131 SkASSERT((fx >> (16 + 14)) == 0); 107 SkASSERT((fx >> (16 + 14)) == 0);
132 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 108 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
133 fx += dx; 109 fx += dx;
134 } 110 }
135 while ((count -= 2) >= 0) 111 while ((count -= 2) >= 0)
136 { 112 {
137 SkASSERT((fx >> (16 + 14)) == 0); 113 SkASSERT((fx >> (16 + 14)) == 0);
138 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 114 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
139 fx += dx; 115 fx += dx;
140 116
141 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 117 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
142 fx += dx; 118 fx += dx;
143 } 119 }
144 } 120 }
OLDNEW
« no previous file with comments | « src/core/SkBitmapProcState_utils.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698