Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
index 7868108378bbab635ac8fa99d4b8fb458db775de..9d50aceaf59b5a78211bc57a644e142beddf6511 100644 |
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
@@ -1033,13 +1033,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
/////////////////////////////////////////////////////////////////////////////// |
-/* 2009/10/27: RBE says "a work in progress"; debugging says ok; |
- * speedup untested, but ARM version is 26 insns/iteration and |
- * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) |
- * which is 10x the native version; that's pure instruction counts, |
- * not accounting for any instruction or memory latencies. |
- */ |
- |
#undef DEBUG_S32_OPAQUE_DITHER |
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
@@ -1058,18 +1051,23 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
uint16x8_t dr, dg, db; |
uint16x8_t dst8; |
- /* source is in ABGR ordering (R == lsb) */ |
{ |
register uint8x8_t d0 asm("d0"); |
register uint8x8_t d1 asm("d1"); |
register uint8x8_t d2 asm("d2"); |
register uint8x8_t d3 asm("d3"); |
- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) |
- : "r" (src) |
- ); |
- sr = d0; sg = d1; sb = d2; |
+ asm ( |
+ "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
+ : |
+ ); |
+ sg = d1; |
+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
+ sr = d2; sb = d0; |
+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
+ sr = d0; sb = d2; |
+#endif |
} |
/* XXX: if we want to prefetch, hide it in the above asm() |
* using the gcc __builtin_prefetch(), the prefetch will |
@@ -1077,34 +1075,34 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
* at the top of the loop, just after the vld4. |
*/ |
- /* sr = sr - (sr>>5) + d */ |
+ // sr = sr - (sr>>5) + d |
sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
dr = vaddl_u8(sr, d); |
- /* sb = sb - (sb>>5) + d */ |
+ // sb = sb - (sb>>5) + d |
sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
db = vaddl_u8(sb, d); |
- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ |
+ // sg = sg - (sg>>6) + d>>1; similar logic for overflows |
sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
- dg = vaddl_u8(sg, vshr_n_u8(d,1)); |
- /* XXX: check that the "d>>1" here is hoisted */ |
+ dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
- /* pack high bits of each into 565 format (rgb, b is lsb) */ |
+ // pack high bits of each into 565 format (rgb, b is lsb) |
dst8 = vshrq_n_u16(db, 3); |
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
- dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); |
+ dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
- /* store it */ |
+ // store it |
vst1q_u16(dst, dst8); |
#if defined(DEBUG_S32_OPAQUE_DITHER) |
- /* always good to know if we generated good results */ |
+ // always good to know if we generated good results |
{ |
int i, myx = x, myy = y; |
DITHER_565_SCAN(myy); |
for (i=0;i<UNROLL;i++) { |
- SkPMColor c = src[i]; |
+ // the '!' writeback on the vld4 above already advanced src past the 8 pixels it loaded, so index backwards to reach them. |
+ SkPMColor c = src[i-8]; |
unsigned dither = DITHER_VALUE(myx); |
uint16_t val = SkDitherRGB32To565(c, dither); |
if (val != dst[i]) { |
@@ -1117,14 +1115,14 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
#endif |
dst += UNROLL; |
- src += UNROLL; |
+ // we don't need to increment src as the asm above has already done it |
count -= UNROLL; |
- x += UNROLL; /* probably superfluous */ |
+ x += UNROLL; // probably superfluous |
} |
} |
#undef UNROLL |
- /* residuals */ |
+ // residuals |
if (count > 0) { |
DITHER_565_SCAN(y); |
do { |