Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(30)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 22269003: ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Address review comments + clean comments Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm.h" 8 #include "SkBlitRow_opts_arm.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 1015 matching lines...) Expand 10 before | Expand all | Expand 10 after
1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1027 } 1027 }
1028 dst += 1; 1028 dst += 1;
1029 DITHER_INC_X(x); 1029 DITHER_INC_X(x);
1030 } while (--count != 0); 1030 } while (--count != 0);
1031 } 1031 }
1032 } 1032 }
1033 1033
1034 /////////////////////////////////////////////////////////////////////////////// 1034 ///////////////////////////////////////////////////////////////////////////////
1035 1035
1036 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1037 * speedup untested, but ARM version is 26 insns/iteration and
1038 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)
1039 * which is 10x the native version; that's pure instruction counts,
1040 * not accounting for any instruction or memory latencies.
1041 */
1042
1043 #undef DEBUG_S32_OPAQUE_DITHER 1036 #undef DEBUG_S32_OPAQUE_DITHER
1044 1037
1045 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1038 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1046 const SkPMColor* SK_RESTRICT src, 1039 const SkPMColor* SK_RESTRICT src,
1047 int count, U8CPU alpha, int x, int y) { 1040 int count, U8CPU alpha, int x, int y) {
1048 SkASSERT(255 == alpha); 1041 SkASSERT(255 == alpha);
1049 1042
1050 #define UNROLL 8 1043 #define UNROLL 8
1051 if (count >= UNROLL) { 1044 if (count >= UNROLL) {
1052 uint8x8_t d; 1045 uint8x8_t d;
1053 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1046 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1054 d = vld1_u8(dstart); 1047 d = vld1_u8(dstart);
1055 1048
1056 while (count >= UNROLL) { 1049 while (count >= UNROLL) {
1057 uint8x8_t sr, sg, sb; 1050 uint8x8_t sr, sg, sb;
1058 uint16x8_t dr, dg, db; 1051 uint16x8_t dr, dg, db;
1059 uint16x8_t dst8; 1052 uint16x8_t dst8;
1060 1053
1061 /* source is in ABGR ordering (R == lsb) */
1062 { 1054 {
1063 register uint8x8_t d0 asm("d0"); 1055 register uint8x8_t d0 asm("d0");
1064 register uint8x8_t d1 asm("d1"); 1056 register uint8x8_t d1 asm("d1");
1065 register uint8x8_t d2 asm("d2"); 1057 register uint8x8_t d2 asm("d2");
1066 register uint8x8_t d3 asm("d3"); 1058 register uint8x8_t d3 asm("d3");
1067 1059
1068 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1060 asm (
1069 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1061 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1070 : "r" (src) 1062 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1071 ); 1063 :
1072 sr = d0; sg = d1; sb = d2; 1064 );
1065 sg = d1;
1066 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1067 sr = d2; sb = d0;
1068 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1069 sr = d0; sb = d2;
1070 #endif
1073 } 1071 }
1074 /* XXX: if we want to prefetch, hide it in the above asm() 1072 /* XXX: if we want to prefetch, hide it in the above asm()
1075 * using the gcc __builtin_prefetch(), the prefetch will 1073 * using the gcc __builtin_prefetch(), the prefetch will
1076 * fall to the bottom of the loop -- it won't stick up 1074 * fall to the bottom of the loop -- it won't stick up
1077 * at the top of the loop, just after the vld4. 1075 * at the top of the loop, just after the vld4.
1078 */ 1076 */
1079 1077
1080 /* sr = sr - (sr>>5) + d */ 1078 // sr = sr - (sr>>5) + d
1081 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1079 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1082 dr = vaddl_u8(sr, d); 1080 dr = vaddl_u8(sr, d);
1083 1081
1084 /* sb = sb - (sb>>5) + d */ 1082 // sb = sb - (sb>>5) + d
1085 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1083 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1086 db = vaddl_u8(sb, d); 1084 db = vaddl_u8(sb, d);
1087 1085
1088 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1086 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1089 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1087 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1090 dg = vaddl_u8(sg, vshr_n_u8(d,1)); 1088 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1091 /* XXX: check that the "d>>1" here is hoisted */
1092 1089
1093 /* pack high bits of each into 565 format (rgb, b is lsb) */ 1090 // pack high bits of each into 565 format (rgb, b is lsb)
1094 dst8 = vshrq_n_u16(db, 3); 1091 dst8 = vshrq_n_u16(db, 3);
1095 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1092 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1096 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); 1093 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1097 1094
1098 /* store it */ 1095 // store it
1099 vst1q_u16(dst, dst8); 1096 vst1q_u16(dst, dst8);
1100 1097
1101 #if defined(DEBUG_S32_OPAQUE_DITHER) 1098 #if defined(DEBUG_S32_OPAQUE_DITHER)
1102 /* always good to know if we generated good results */ 1099 // always good to know if we generated good results
1103 { 1100 {
1104 int i, myx = x, myy = y; 1101 int i, myx = x, myy = y;
1105 DITHER_565_SCAN(myy); 1102 DITHER_565_SCAN(myy);
1106 for (i=0;i<UNROLL;i++) { 1103 for (i=0;i<UNROLL;i++) {
1107 SkPMColor c = src[i]; 1104 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1105 SkPMColor c = src[i-8];
1108 unsigned dither = DITHER_VALUE(myx); 1106 unsigned dither = DITHER_VALUE(myx);
1109 uint16_t val = SkDitherRGB32To565(c, dither); 1107 uint16_t val = SkDitherRGB32To565(c, dither);
1110 if (val != dst[i]) { 1108 if (val != dst[i]) {
1111 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n", 1109 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x \n",
1112 c, dither, val, dst[i], dstart[i]); 1110 c, dither, val, dst[i], dstart[i]);
1113 } 1111 }
1114 DITHER_INC_X(myx); 1112 DITHER_INC_X(myx);
1115 } 1113 }
1116 } 1114 }
1117 #endif 1115 #endif
1118 1116
1119 dst += UNROLL; 1117 dst += UNROLL;
1120 src += UNROLL; 1118 // we don't need to increment src as the asm above has already done it
1121 count -= UNROLL; 1119 count -= UNROLL;
1122 x += UNROLL; /* probably superfluous */ 1120 x += UNROLL; // probably superfluous
1123 } 1121 }
1124 } 1122 }
1125 #undef UNROLL 1123 #undef UNROLL
1126 1124
1127 /* residuals */ 1125 // residuals
1128 if (count > 0) { 1126 if (count > 0) {
1129 DITHER_565_SCAN(y); 1127 DITHER_565_SCAN(y);
1130 do { 1128 do {
1131 SkPMColor c = *src++; 1129 SkPMColor c = *src++;
1132 SkPMColorAssert(c); 1130 SkPMColorAssert(c);
1133 SkASSERT(SkGetPackedA32(c) == 255); 1131 SkASSERT(SkGetPackedA32(c) == 255);
1134 1132
1135 unsigned dither = DITHER_VALUE(x); 1133 unsigned dither = DITHER_VALUE(x);
1136 *dst++ = SkDitherRGB32To565(c, dither); 1134 *dst++ = SkDitherRGB32To565(c, dither);
1137 DITHER_INC_X(x); 1135 DITHER_INC_X(x);
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after
1268 * case where we do not inspect the src alpha. 1266 * case where we do not inspect the src alpha.
1269 */ 1267 */
1270 #if SK_A32_SHIFT == 24 1268 #if SK_A32_SHIFT == 24
1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1269 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1270 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1273 #else 1271 #else
1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1272 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1275 #endif 1273 #endif
1276 S32A_Blend_BlitRow32_arm // S32A_Blend 1274 S32A_Blend_BlitRow32_arm // S32A_Blend
1277 }; 1275 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698