OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 1015 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1027 } | 1027 } |
1028 dst += 1; | 1028 dst += 1; |
1029 DITHER_INC_X(x); | 1029 DITHER_INC_X(x); |
1030 } while (--count != 0); | 1030 } while (--count != 0); |
1031 } | 1031 } |
1032 } | 1032 } |
1033 | 1033 |
1034 /////////////////////////////////////////////////////////////////////////////// | 1034 /////////////////////////////////////////////////////////////////////////////// |
1035 | 1035 |
1036 /* 2009/10/27: RBE says "a work in progress"; debugging says ok; | |
1037 * speedup untested, but ARM version is 26 insns/iteration and | |
1038 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) | |
1039 * which is 10x the native version; that's pure instruction counts, | |
1040 * not accounting for any instruction or memory latencies. | |
1041 */ | |
1042 | |
1043 #undef DEBUG_S32_OPAQUE_DITHER | 1036 #undef DEBUG_S32_OPAQUE_DITHER |
1044 | 1037 |
1045 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, | 1038 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
1046 const SkPMColor* SK_RESTRICT src, | 1039 const SkPMColor* SK_RESTRICT src, |
1047 int count, U8CPU alpha, int x, int y) { | 1040 int count, U8CPU alpha, int x, int y) { |
1048 SkASSERT(255 == alpha); | 1041 SkASSERT(255 == alpha); |
1049 | 1042 |
1050 #define UNROLL 8 | 1043 #define UNROLL 8 |
1051 if (count >= UNROLL) { | 1044 if (count >= UNROLL) { |
1052 uint8x8_t d; | 1045 uint8x8_t d; |
1053 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | 1046 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
1054 d = vld1_u8(dstart); | 1047 d = vld1_u8(dstart); |
1055 | 1048 |
1056 while (count >= UNROLL) { | 1049 while (count >= UNROLL) { |
1057 uint8x8_t sr, sg, sb; | 1050 uint8x8_t sr, sg, sb; |
1058 uint16x8_t dr, dg, db; | 1051 uint16x8_t dr, dg, db; |
1059 uint16x8_t dst8; | 1052 uint16x8_t dst8; |
1060 | 1053 |
1061 /* source is in ABGR ordering (R == lsb) */ | |
1062 { | 1054 { |
1063 register uint8x8_t d0 asm("d0"); | 1055 register uint8x8_t d0 asm("d0"); |
1064 register uint8x8_t d1 asm("d1"); | 1056 register uint8x8_t d1 asm("d1"); |
1065 register uint8x8_t d2 asm("d2"); | 1057 register uint8x8_t d2 asm("d2"); |
1066 register uint8x8_t d3 asm("d3"); | 1058 register uint8x8_t d3 asm("d3"); |
1067 | 1059 |
1068 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" | 1060 asm ( |
1069 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) | 1061 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
1070 : "r" (src) | 1062 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
1071 ); | 1063 : |
1072 sr = d0; sg = d1; sb = d2; | 1064 ); |
| 1065 sg = d1; |
| 1066 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| 1067 sr = d2; sb = d0; |
| 1068 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| 1069 sr = d0; sb = d2; |
| 1070 #endif |
1073 } | 1071 } |
1074 /* XXX: if we want to prefetch, hide it in the above asm() | 1072 /* XXX: if we want to prefetch, hide it in the above asm() |
1075 * using the gcc __builtin_prefetch(), the prefetch will | 1073 * using the gcc __builtin_prefetch(), the prefetch will |
1076 * fall to the bottom of the loop -- it won't stick up | 1074 * fall to the bottom of the loop -- it won't stick up |
1077 * at the top of the loop, just after the vld4. | 1075 * at the top of the loop, just after the vld4. |
1078 */ | 1076 */ |
1079 | 1077 |
1080 /* sr = sr - (sr>>5) + d */ | 1078 // sr = sr - (sr>>5) + d |
1081 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); | 1079 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
1082 dr = vaddl_u8(sr, d); | 1080 dr = vaddl_u8(sr, d); |
1083 | 1081 |
1084 /* sb = sb - (sb>>5) + d */ | 1082 // sb = sb - (sb>>5) + d |
1085 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); | 1083 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
1086 db = vaddl_u8(sb, d); | 1084 db = vaddl_u8(sb, d); |
1087 | 1085 |
1088 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ | 1086 // sg = sg - (sg>>6) + d>>1; similar logic for overflows |
1089 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); | 1087 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
1090 dg = vaddl_u8(sg, vshr_n_u8(d,1)); | 1088 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
1091 /* XXX: check that the "d>>1" here is hoisted */ | |
1092 | 1089 |
1093 /* pack high bits of each into 565 format (rgb, b is lsb) */ | 1090 // pack high bits of each into 565 format (rgb, b is lsb) |
1094 dst8 = vshrq_n_u16(db, 3); | 1091 dst8 = vshrq_n_u16(db, 3); |
1095 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); | 1092 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
1096 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); | 1093 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
1097 | 1094 |
1098 /* store it */ | 1095 // store it |
1099 vst1q_u16(dst, dst8); | 1096 vst1q_u16(dst, dst8); |
1100 | 1097 |
1101 #if defined(DEBUG_S32_OPAQUE_DITHER) | 1098 #if defined(DEBUG_S32_OPAQUE_DITHER) |
1102 /* always good to know if we generated good results */ | 1099 // always good to know if we generated good results |
1103 { | 1100 { |
1104 int i, myx = x, myy = y; | 1101 int i, myx = x, myy = y; |
1105 DITHER_565_SCAN(myy); | 1102 DITHER_565_SCAN(myy); |
1106 for (i=0;i<UNROLL;i++) { | 1103 for (i=0;i<UNROLL;i++) { |
1107 SkPMColor c = src[i]; | 1104 // the '!' in the asm block above post-incremented src by the 8 pixels it reads. |
| 1105 SkPMColor c = src[i-8]; |
1108 unsigned dither = DITHER_VALUE(myx); | 1106 unsigned dither = DITHER_VALUE(myx); |
1109 uint16_t val = SkDitherRGB32To565(c, dither); | 1107 uint16_t val = SkDitherRGB32To565(c, dither); |
1110 if (val != dst[i]) { | 1108 if (val != dst[i]) { |
1111 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x
\n", | 1109 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x
\n", |
1112 c, dither, val, dst[i], dstart[i]); | 1110 c, dither, val, dst[i], dstart[i]); |
1113 } | 1111 } |
1114 DITHER_INC_X(myx); | 1112 DITHER_INC_X(myx); |
1115 } | 1113 } |
1116 } | 1114 } |
1117 #endif | 1115 #endif |
1118 | 1116 |
1119 dst += UNROLL; | 1117 dst += UNROLL; |
1120 src += UNROLL; | 1118 // we don't need to increment src as the asm above has already done it |
1121 count -= UNROLL; | 1119 count -= UNROLL; |
1122 x += UNROLL; /* probably superfluous */ | 1120 x += UNROLL; // probably superfluous |
1123 } | 1121 } |
1124 } | 1122 } |
1125 #undef UNROLL | 1123 #undef UNROLL |
1126 | 1124 |
1127 /* residuals */ | 1125 // residuals |
1128 if (count > 0) { | 1126 if (count > 0) { |
1129 DITHER_565_SCAN(y); | 1127 DITHER_565_SCAN(y); |
1130 do { | 1128 do { |
1131 SkPMColor c = *src++; | 1129 SkPMColor c = *src++; |
1132 SkPMColorAssert(c); | 1130 SkPMColorAssert(c); |
1133 SkASSERT(SkGetPackedA32(c) == 255); | 1131 SkASSERT(SkGetPackedA32(c) == 255); |
1134 | 1132 |
1135 unsigned dither = DITHER_VALUE(x); | 1133 unsigned dither = DITHER_VALUE(x); |
1136 *dst++ = SkDitherRGB32To565(c, dither); | 1134 *dst++ = SkDitherRGB32To565(c, dither); |
1137 DITHER_INC_X(x); | 1135 DITHER_INC_X(x); |
(...skipping 130 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1268 * case where we do not inspect the src alpha. | 1266 * case where we do not inspect the src alpha. |
1269 */ | 1267 */ |
1270 #if SK_A32_SHIFT == 24 | 1268 #if SK_A32_SHIFT == 24 |
1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1269 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1270 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1273 #else | 1271 #else |
1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1272 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1275 #endif | 1273 #endif |
1276 S32A_Blend_BlitRow32_arm // S32A_Blend | 1274 S32A_Blend_BlitRow32_arm // S32A_Blend |
1277 }; | 1275 }; |
OLD | NEW |