| Index: src/opts/SkBitmapProcState_matrixProcs_neon.cpp | 
| diff --git a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp | 
| index 7d75143e20bda1dd6d3687c6c35ba0da955f1a17..e81da6705263894b3d529f2782d0a10fd93c26ef 100644 | 
| --- a/src/opts/SkBitmapProcState_matrixProcs_neon.cpp | 
| +++ b/src/opts/SkBitmapProcState_matrixProcs_neon.cpp | 
| @@ -8,6 +8,7 @@ | 
| #include "SkPerspIter.h" | 
| #include "SkShader.h" | 
| #include "SkUtilsArm.h" | 
| +#include "SkBitmapProcState_utils.h" | 
|  | 
| extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; | 
| extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; | 
| @@ -15,10 +16,6 @@ extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; | 
| static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); | 
| static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); | 
|  | 
| -static unsigned SK_USHIFT16(unsigned x) { | 
| -    return x >> 16; | 
| -} | 
| - | 
| #define MAKENAME(suffix)        ClampX_ClampY ## suffix ## _neon | 
| #define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max) | 
| #define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max) | 
| @@ -35,93 +32,72 @@ static unsigned SK_USHIFT16(unsigned x) { | 
| #include "SkBitmapProcState_matrix_repeat_neon.h" | 
|  | 
|  | 
| -void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) | 
| -{ | 
| -    int i; | 
|  | 
| +void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { | 
| if (count >= 8) { | 
| -        /* SkFixed is 16.16 fixed point */ | 
| -        SkFixed dx2 = dx+dx; | 
| -        SkFixed dx4 = dx2+dx2; | 
| -        SkFixed dx8 = dx4+dx4; | 
| +        // SkFixed is 16.16 fixed point | 
| +        SkFixed dx8 = dx * 8; | 
| +        int32x4_t vdx8 = vdupq_n_s32(dx8); | 
|  | 
| -        /* now build fx/fx+dx/fx+2dx/fx+3dx */ | 
| -        SkFixed fx1, fx2, fx3; | 
| +        // set up lbase (fx..fx+3dx) and hbase (fx+4dx..fx+7dx) | 
| int32x4_t lbase, hbase; | 
| -        uint16_t *dst16 = (uint16_t *)dst; | 
| - | 
| -        fx1 = fx+dx; | 
| -        fx2 = fx1+dx; | 
| -        fx3 = fx2+dx; | 
| - | 
| -        /* avoid an 'lbase unitialized' warning */ | 
| lbase = vdupq_n_s32(fx); | 
| -        lbase = vsetq_lane_s32(fx1, lbase, 1); | 
| -        lbase = vsetq_lane_s32(fx2, lbase, 2); | 
| -        lbase = vsetq_lane_s32(fx3, lbase, 3); | 
| -        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); | 
| +        lbase = vsetq_lane_s32(fx + dx, lbase, 1); | 
| +        lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2); | 
| +        lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3); | 
| +        hbase = lbase + vdupq_n_s32(4 * dx); | 
|  | 
| -        /* take upper 16 of each, store, and bump everything */ | 
| do { | 
| -            int32x4_t lout, hout; | 
| -            uint16x8_t hi16; | 
| - | 
| -            lout = lbase; | 
| -            hout = hbase; | 
| -            /* gets hi's of all louts then hi's of all houts */ | 
| -            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); | 
| -            hi16 = vreinterpretq_u16_s32(hout); | 
| -            vst1q_u16(dst16, hi16); | 
| - | 
| -            /* on to the next */ | 
| -            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); | 
| -            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); | 
| -            dst16 += 8; | 
| +            // store the upper 16 bits | 
| +            vst1q_u32(dst, vreinterpretq_u32_s16( | 
| +                vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1] | 
| +            )); | 
| + | 
| +            // on to the next group of 8 | 
| +            lbase += vdx8; | 
| +            hbase += vdx8; | 
| +            dst += 4; // 8 uint16 results occupy 4 uint32 slots | 
| count -= 8; | 
| fx += dx8; | 
| } while (count >= 8); | 
| -        dst = (uint32_t *) dst16; | 
| } | 
|  | 
| uint16_t* xx = (uint16_t*)dst; | 
| -    for (i = count; i > 0; --i) { | 
| +    for (int i = count; i > 0; --i) { | 
| *xx++ = SkToU16(fx >> 16); fx += dx; | 
| } | 
| } | 
|  | 
| -void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) | 
| -{ | 
| +void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { | 
| if (count >= 8) { | 
| -        int32x4_t wide_fx; | 
| -        int32x4_t wide_fx2; | 
| -        int32x4_t wide_dx8 = vdupq_n_s32(dx*8); | 
| +        SkFixed dx8 = dx * 8; | 
| +        int32x4_t vdx8 = vdupq_n_s32(dx8); | 
|  | 
| +        int32x4_t wide_fx, wide_fx2; | 
| wide_fx = vdupq_n_s32(fx); | 
| -        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); | 
| -        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); | 
| -        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); | 
| +        wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1); | 
| +        wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2); | 
| +        wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3); | 
|  | 
| -        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); | 
| +        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx)); | 
|  | 
| while (count >= 8) { | 
| int32x4_t wide_out; | 
| int32x4_t wide_out2; | 
|  | 
| wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); | 
| -            wide_out = vorrq_s32(wide_out, | 
| -            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); | 
| +            wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1)); | 
|  | 
| wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); | 
| -            wide_out2 = vorrq_s32(wide_out2, | 
| -            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); | 
| +            wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1)); | 
|  | 
| vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); | 
| vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); | 
|  | 
| dst += 8; | 
| -            fx += dx*8; | 
| -            wide_fx = vaddq_s32(wide_fx, wide_dx8); | 
| -            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); | 
| +            fx += dx8; | 
| +            wide_fx += vdx8; | 
| +            wide_fx2 += vdx8; | 
| count -= 8; | 
| } | 
| } | 
|  |