src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 22269003: ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed

Unified Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 22269003: ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Address review comments + clean comments Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlitRow_opts_arm_neon.cpp

diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp

index 7868108378bbab635ac8fa99d4b8fb458db775de..9d50aceaf59b5a78211bc57a644e142beddf6511 100644

--- a/src/opts/SkBlitRow_opts_arm_neon.cpp

+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp

@@ -1033,13 +1033,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,

///////////////////////////////////////////////////////////////////////////////

-/* 2009/10/27: RBE says "a work in progress"; debugging says ok;

- * speedup untested, but ARM version is 26 insns/iteration and

- * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)

- * which is 10x the native version; that's pure instruction counts,

- * not accounting for any instruction or memory latencies.

- */

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

@@ -1058,18 +1051,23 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

uint16x8_t dr, dg, db;

uint16x8_t dst8;

- /* source is in ABGR ordering (R == lsb) */

{

- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"

- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)

- : "r" (src)

- );

- sr = d0; sg = d1; sb = d2;

+ asm (

+ "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"

+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)

+ :

+ );

+ sg = d1;

+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)

+ sr = d2; sb = d0;

+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)

+ sr = d0; sb = d2;

+#endif

}

/* XXX: if we want to prefetch, hide it in the above asm()

* using the gcc __builtin_prefetch(), the prefetch will

@@ -1077,34 +1075,34 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

* at the top of the loop, just after the vld4.

- /* sr = sr - (sr>>5) + d */

+ // sr = sr - (sr>>5) + d

sr = vsub_u8(sr, vshr_n_u8(sr, 5));

dr = vaddl_u8(sr, d);

- /* sb = sb - (sb>>5) + d */

+ // sb = sb - (sb>>5) + d

sb = vsub_u8(sb, vshr_n_u8(sb, 5));

db = vaddl_u8(sb, d);

- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */

+ // sg = sg - (sg>>6) + (d>>1); similar logic for overflows

sg = vsub_u8(sg, vshr_n_u8(sg, 6));

- dg = vaddl_u8(sg, vshr_n_u8(d,1));

- /* XXX: check that the "d>>1" here is hoisted */

+ dg = vaddl_u8(sg, vshr_n_u8(d, 1));

- /* pack high bits of each into 565 format (rgb, b is lsb) */

+ // pack high bits of each into 565 format (rgb, b is lsb)

dst8 = vshrq_n_u16(db, 3);

dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);

- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

+ dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

- /* store it */

+ // store it

vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)

- /* always good to know if we generated good results */

+ // always good to know if we generated good results

{

int i, myx = x, myy = y;

DITHER_565_SCAN(myy);

for (i=0;i<UNROLL;i++) {

- SkPMColor c = src[i];

+ // the '!' in the asm block above post-incremented src by the 8 pixels it reads.

+ SkPMColor c = src[i-8];

unsigned dither = DITHER_VALUE(myx);

uint16_t val = SkDitherRGB32To565(c, dither);

if (val != dst[i]) {

@@ -1117,14 +1115,14 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

#endif

dst += UNROLL;

- src += UNROLL;

+ // we don't need to increment src as the asm above has already done it

count -= UNROLL;

- x += UNROLL; /* probably superfluous */

+ x += UNROLL; // probably superfluous

}

#undef UNROLL

- /* residuals */

+ // residuals

if (count > 0) {

DITHER_565_SCAN(y);

do {

« no previous file with comments | « no previous file | no next file » | no next file with comments »