src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 22269003: ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 22269003: ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Address review comments + clean comments Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm.h"	8 #include "SkBlitRow_opts_arm.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 1015 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);	1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

1027 }	1027 }

1028 dst += 1;	1028 dst += 1;

1029 DITHER_INC_X(x);	1029 DITHER_INC_X(x);

1030 } while (--count != 0);	1030 } while (--count != 0);

1031 }	1031 }

1032 }	1032 }

1033	1033

1034 ///////////////////////////////////////////////////////////////////////////////	1034 ///////////////////////////////////////////////////////////////////////////////

1035	1035

1036 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;

1037 * speedup untested, but ARM version is 26 insns/iteration and

1038 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)

1039 * which is 10x the native version; that's pure instruction counts,

1040 * not accounting for any instruction or memory latencies.

1041 */

1042

1043 #undef DEBUG_S32_OPAQUE_DITHER	1036 #undef DEBUG_S32_OPAQUE_DITHER

1044	1037

1045 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,	1038 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,

1046 const SkPMColor* SK_RESTRICT src,	1039 const SkPMColor* SK_RESTRICT src,

1047 int count, U8CPU alpha, int x, int y) {	1040 int count, U8CPU alpha, int x, int y) {

1048 SkASSERT(255 == alpha);	1041 SkASSERT(255 == alpha);

1049	1042

1050 #define UNROLL 8	1043 #define UNROLL 8

1051 if (count >= UNROLL) {	1044 if (count >= UNROLL) {

1052 uint8x8_t d;	1045 uint8x8_t d;

1053 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];	1046 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

1054 d = vld1_u8(dstart);	1047 d = vld1_u8(dstart);

1055	1048

1056 while (count >= UNROLL) {	1049 while (count >= UNROLL) {

1057 uint8x8_t sr, sg, sb;	1050 uint8x8_t sr, sg, sb;

1058 uint16x8_t dr, dg, db;	1051 uint16x8_t dr, dg, db;

1059 uint16x8_t dst8;	1052 uint16x8_t dst8;

1060	1053

1061 /* source is in ABGR ordering (R == lsb) */

1062 {	1054 {

1063 register uint8x8_t d0 asm("d0");	1055 register uint8x8_t d0 asm("d0");

1064 register uint8x8_t d1 asm("d1");	1056 register uint8x8_t d1 asm("d1");

1065 register uint8x8_t d2 asm("d2");	1057 register uint8x8_t d2 asm("d2");

1066 register uint8x8_t d3 asm("d3");	1058 register uint8x8_t d3 asm("d3");

1067	1059

1068 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"	1060 asm (

1069 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)	1061 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"

1070 : "r" (src)	1062 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)

1071 );	1063 :

1072 sr = d0; sg = d1; sb = d2;	1064 );

	1065 sg = d1;

	1066 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)

	1067 sr = d2; sb = d0;

	1068 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)

	1069 sr = d0; sb = d2;

	1070 #endif

1073 }	1071 }

1074 /* XXX: if we want to prefetch, hide it in the above asm()	1072 /* XXX: if we want to prefetch, hide it in the above asm()

1075 * using the gcc __builtin_prefetch(), the prefetch will	1073 * using the gcc __builtin_prefetch(), the prefetch will

1076 * fall to the bottom of the loop -- it won't stick up	1074 * fall to the bottom of the loop -- it won't stick up

1077 * at the top of the loop, just after the vld4.	1075 * at the top of the loop, just after the vld4.

1078 */	1076 */

1079	1077

1080 /* sr = sr - (sr>>5) + d */	1078 // sr = sr - (sr>>5) + d

1081 sr = vsub_u8(sr, vshr_n_u8(sr, 5));	1079 sr = vsub_u8(sr, vshr_n_u8(sr, 5));

1082 dr = vaddl_u8(sr, d);	1080 dr = vaddl_u8(sr, d);

1083	1081

1084 /* sb = sb - (sb>>5) + d */	1082 // sb = sb - (sb>>5) + d

1085 sb = vsub_u8(sb, vshr_n_u8(sb, 5));	1083 sb = vsub_u8(sb, vshr_n_u8(sb, 5));

1086 db = vaddl_u8(sb, d);	1084 db = vaddl_u8(sb, d);

1087	1085

1088 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */	1086 // sg = sg - (sg>>6) + d>>1; similar logic for overflows

1089 sg = vsub_u8(sg, vshr_n_u8(sg, 6));	1087 sg = vsub_u8(sg, vshr_n_u8(sg, 6));

1090 dg = vaddl_u8(sg, vshr_n_u8(d,1));	1088 dg = vaddl_u8(sg, vshr_n_u8(d, 1));

1091 /* XXX: check that the "d>>1" here is hoisted */

1092	1089

1093 /* pack high bits of each into 565 format (rgb, b is lsb) */	1090 // pack high bits of each into 565 format (rgb, b is lsb)

1094 dst8 = vshrq_n_u16(db, 3);	1091 dst8 = vshrq_n_u16(db, 3);

1095 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);	1092 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);

1096 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);	1093 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

1097	1094

1098 /* store it */	1095 // store it

1099 vst1q_u16(dst, dst8);	1096 vst1q_u16(dst, dst8);

1100	1097

1101 #if defined(DEBUG_S32_OPAQUE_DITHER)	1098 #if defined(DEBUG_S32_OPAQUE_DITHER)

1102 /* always good to know if we generated good results */	1099 // always good to know if we generated good results

1103 {	1100 {

1104 int i, myx = x, myy = y;	1101 int i, myx = x, myy = y;

1105 DITHER_565_SCAN(myy);	1102 DITHER_565_SCAN(myy);

1106 for (i=0;i<UNROLL;i++) {	1103 for (i=0;i<UNROLL;i++) {

1107 SkPMColor c = src[i];	1104 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.

	1105 SkPMColor c = src[i-8];

1108 unsigned dither = DITHER_VALUE(myx);	1106 unsigned dither = DITHER_VALUE(myx);

1109 uint16_t val = SkDitherRGB32To565(c, dither);	1107 uint16_t val = SkDitherRGB32To565(c, dither);

1110 if (val != dst[i]) {	1108 if (val != dst[i]) {

1111 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n",	1109 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n",

1112 c, dither, val, dst[i], dstart[i]);	1110 c, dither, val, dst[i], dstart[i]);

1113 }	1111 }

1114 DITHER_INC_X(myx);	1112 DITHER_INC_X(myx);

1115 }	1113 }

1116 }	1114 }

1117 #endif	1115 #endif

1118	1116

1119 dst += UNROLL;	1117 dst += UNROLL;

1120 src += UNROLL;	1118 // we don't need to increment src as the asm above has already done it

1121 count -= UNROLL;	1119 count -= UNROLL;

1122 x += UNROLL; /* probably superfluous */	1120 x += UNROLL; // probably superfluous

1123 }	1121 }

1124 }	1122 }

1125 #undef UNROLL	1123 #undef UNROLL

1126	1124

1127 /* residuals */	1125 // residuals

1128 if (count > 0) {	1126 if (count > 0) {

1129 DITHER_565_SCAN(y);	1127 DITHER_565_SCAN(y);

1130 do {	1128 do {

1131 SkPMColor c = *src++;	1129 SkPMColor c = *src++;

1132 SkPMColorAssert(c);	1130 SkPMColorAssert(c);

1133 SkASSERT(SkGetPackedA32(c) == 255);	1131 SkASSERT(SkGetPackedA32(c) == 255);

1134	1132

1135 unsigned dither = DITHER_VALUE(x);	1133 unsigned dither = DITHER_VALUE(x);

1136 *dst++ = SkDitherRGB32To565(c, dither);	1134 *dst++ = SkDitherRGB32To565(c, dither);

1137 DITHER_INC_X(x);	1135 DITHER_INC_X(x);

(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1268 * case where we do not inspect the src alpha.	1266 * case where we do not inspect the src alpha.

1269 */	1267 */

1270 #if SK_A32_SHIFT == 24	1268 #if SK_A32_SHIFT == 24

1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1269 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1270 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1273 #else	1271 #else

1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1272 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1275 #endif	1273 #endif

1276 S32A_Blend_BlitRow32_arm // S32A_Blend	1274 S32A_Blend_BlitRow32_arm // S32A_Blend

1277 };	1275 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »