Index: src/opts/SkBitmapProcState_matrixProcs_neon.cpp
diff --git a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
index 7d75143e20bda1dd6d3687c6c35ba0da955f1a17..e81da6705263894b3d529f2782d0a10fd93c26ef 100644
--- a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
+++ b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp
@@ -8,6 +8,7 @@
 #include "SkPerspIter.h"
 #include "SkShader.h"
 #include "SkUtilsArm.h"
+#include "SkBitmapProcState_utils.h"
 
 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
@@ -15,10 +16,6 @@ extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
 
-static unsigned SK_USHIFT16(unsigned x) {
-    return x >> 16;
-}
-
 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon
 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
@@ -35,93 +32,72 @@ static unsigned SK_USHIFT16(unsigned x) {
 #include "SkBitmapProcState_matrix_repeat_neon.h"
 
 
-void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
-{
-    int i;
-
+void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
     if (count >= 8) {
-        /* SkFixed is 16.16 fixed point */
-        SkFixed dx2 = dx+dx;
-        SkFixed dx4 = dx2+dx2;
-        SkFixed dx8 = dx4+dx4;
+        // SkFixed is 16.16 fixed point
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
 
-        /* now build fx/fx+dx/fx+2dx/fx+3dx */
-        SkFixed fx1, fx2, fx3;
+        // setup lbase and hbase
         int32x4_t lbase, hbase;
-        uint16_t *dst16 = (uint16_t *)dst;
-
-        fx1 = fx+dx;
-        fx2 = fx1+dx;
-        fx3 = fx2+dx;
-
-        /* avoid an 'lbase unitialized' warning */
         lbase = vdupq_n_s32(fx);
-        lbase = vsetq_lane_s32(fx1, lbase, 1);
-        lbase = vsetq_lane_s32(fx2, lbase, 2);
-        lbase = vsetq_lane_s32(fx3, lbase, 3);
-        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
+        lbase = vsetq_lane_s32(fx + dx, lbase, 1);
+        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2);
+        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3);
+        hbase = lbase + vdupq_n_s32(4 * dx);
 
-        /* take upper 16 of each, store, and bump everything */
         do {
-            int32x4_t lout, hout;
-            uint16x8_t hi16;
-
-            lout = lbase;
-            hout = hbase;
-            /* gets hi's of all louts then hi's of all houts */
-            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
-            hi16 = vreinterpretq_u16_s32(hout);
-            vst1q_u16(dst16, hi16);
-
-            /* on to the next */
-            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
-            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
-            dst16 += 8;
+            // store the upper 16 bits
+            vst1q_u32(dst, vreinterpretq_u32_s16(
+                vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1]
+            ));
+
+            // on to the next group of 8
+            lbase += vdx8;
+            hbase += vdx8;
+            dst += 4; // we did 8 elements but the result is twice smaller
             count -= 8;
             fx += dx8;
         } while (count >= 8);
-        dst = (uint32_t *) dst16;
     }
 
     uint16_t* xx = (uint16_t*)dst;
-    for (i = count; i > 0; --i) {
+    for (int i = count; i > 0; --i) {
         *xx++ = SkToU16(fx >> 16); fx += dx;
     }
 }
 
-void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
-{
+void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) {
     if (count >= 8) {
-        int32x4_t wide_fx;
-        int32x4_t wide_fx2;
-        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
+        SkFixed dx8 = dx * 8;
+        int32x4_t vdx8 = vdupq_n_s32(dx8);
 
+        int32x4_t wide_fx, wide_fx2;
         wide_fx = vdupq_n_s32(fx);
-        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
-        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
-        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
+        wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1);
+        wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2);
+        wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3);
 
-        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
+        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx));
 
         while (count >= 8) {
             int32x4_t wide_out;
             int32x4_t wide_out2;
 
             wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
-            wide_out = vorrq_s32(wide_out,
-                                 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
+            wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1));
 
             wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
-            wide_out2 = vorrq_s32(wide_out2,
-                                  vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
+            wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1));
 
             vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
             vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
 
             dst += 8;
-            fx += dx*8;
-            wide_fx = vaddq_s32(wide_fx, wide_dx8);
-            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
+            fx += dx8;
+            wide_fx += vdx8;
+            wide_fx2 += vdx8;
             count -= 8;
         }
     }
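
For reference: the rewritten loop in decal_nofilter_scale_neon replaces the inline "vuzpq.16" asm with the vuzpq_s16 intrinsic. Reinterpreting the two int32x4_t accumulators as int16x8_t and de-interleaving them collects, on a little-endian build, the upper 16-bit half of each of the eight 16.16 fixed-point lanes into .val[1] in lane order, which is then stored as four uint32_t. The `lbase += vdx8` and `wide_fx += vdx8` forms rely on the GCC/Clang vector-extension operators for NEON types and compute the same thing as the vaddq_s32 calls they replace. A minimal scalar sketch of what the routine computes as a whole (the name decal_nofilter_scale_portable and the standalone SkFixed typedef are illustrative assumptions, not part of the patch):

    #include <cstdint>

    typedef int32_t SkFixed;  // 16.16 fixed point, as in Skia's SkFixed.h

    // dst[] is really a packed uint16_t[]: each output element is the
    // integer part (upper 16 bits) of fx, with fx stepping by dx per
    // element. The NEON loop above emits 8 such uint16_t (4 uint32_t)
    // per iteration, hence "dst += 4" after each group of 8.
    static void decal_nofilter_scale_portable(uint32_t dst[], SkFixed fx,
                                              SkFixed dx, int count) {
        uint16_t* xx = (uint16_t*)dst;
        for (int i = 0; i < count; ++i) {
            *xx++ = (uint16_t)(fx >> 16);
            fx += dx;
        }
    }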
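The words built in decal_filter_scale_neon follow the packed filter format used by Skia's matrix procs: the first source column in the top 14 bits, a 4-bit subpixel weight, then the next column in the low 14 bits. A scalar sketch of one packed value, equivalent to ((fx >> 12) << 14) | ((fx >> 16) + 1) from the loop above (pack_decal_filter is a hypothetical helper, reusing the SkFixed typedef from the sketch above; it assumes fx is non-negative and the column fits in 14 bits, which the decal path guarantees):

    // bits [31:18] = x0, bits [17:14] = filter weight, bits [13:0] = x1
    static inline uint32_t pack_decal_filter(SkFixed fx) {
        uint32_t x0  = (uint32_t)fx >> 16;          // integer source column
        uint32_t sub = ((uint32_t)fx >> 12) & 0xF;  // top 4 fractional bits
        return (x0 << 18) | (sub << 14) | (x0 + 1); // decal: x1 = x0 + 1, no clamp
    }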