Index: src/core/SkBlitter_RGB16.cpp |
diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp |
index 615a6a39dbe80fc7d5fefdc75e514a75b15fa296..86f0eb61576a89e8a0cc4bc4f2184865834f8fc2 100644 |
--- a/src/core/SkBlitter_RGB16.cpp |
+++ b/src/core/SkBlitter_RGB16.cpp |
@@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, |
do { |
int w = width; |
if (w >= UNROLL) { |
- uint32x4_t color; /* can use same one */ |
- uint32x4_t dev_lo, dev_hi; |
- uint32x4_t t1; |
- uint32x4_t wn1, wn2; |
- uint16x4_t odev_lo, odev_hi; |
- uint16x4_t alpha_lo, alpha_hi; |
- uint16x8_t alpha_full; |
- |
+ uint32x4_t color, dev_lo, dev_hi; |
+ uint32x4_t wn1, wn2, tmp; |
+ uint32x4_t vmask_g16, vmask_ng16; |
+ uint16x8_t valpha, vdev; |
+ uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi; |
+ |
+ // prepare constants |
+ vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE); |
+ vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE); |
color = vdupq_n_u32(expanded32); |
do { |
- /* alpha is 8x8, widen and split to get pair of 16x4's */ |
- alpha_full = vmovl_u8(vld1_u8(alpha)); |
- alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7)); |
- alpha_full = vshrq_n_u16(alpha_full, 3); |
- alpha_lo = vget_low_u16(alpha_full); |
- alpha_hi = vget_high_u16(alpha_full); |
- |
- dev_lo = vmovl_u16(vld1_u16(device)); |
- dev_hi = vmovl_u16(vld1_u16(device+4)); |
- |
- /* unpack in 32 bits */ |
- dev_lo = vorrq_u32( |
- vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), |
- vshlq_n_u32(vandq_u32(dev_lo, |
- vdupq_n_u32(0x000007E0)), |
- 16) |
- ); |
- dev_hi = vorrq_u32( |
- vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), |
- vshlq_n_u32(vandq_u32(dev_hi, |
- vdupq_n_u32(0x000007E0)), |
- 16) |
- ); |
- |
- /* blend the two */ |
- t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo)); |
- t1 = vshrq_n_u32(t1, 5); |
- dev_lo = vaddq_u32(dev_lo, t1); |
- |
- t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi)); |
- t1 = vshrq_n_u32(t1, 5); |
- dev_hi = vaddq_u32(dev_hi, t1); |
- |
- /* re-compact and store */ |
- wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), |
- wn2 = vshrq_n_u32(dev_lo, 16); |
- wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); |
- odev_lo = vmovn_u32(vorrq_u32(wn1, wn2)); |
- |
- wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), |
- wn2 = vshrq_n_u32(dev_hi, 16); |
- wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); |
- odev_hi = vmovn_u32(vorrq_u32(wn1, wn2)); |
- |
- vst1_u16(device, odev_lo); |
- vst1_u16(device+4, odev_hi); |
+ // alpha is 8x8, widen and split to get a pair of 16x4 |
+ valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha)); |
+ valpha = vshrq_n_u16(valpha, 3); |
+ valpha_lo = vget_low_u16(valpha); |
+ valpha_hi = vget_high_u16(valpha); |
+ |
+ // load pixels |
+ vdev = vld1q_u16(device); |
+ dev_lo = vmovl_u16(vget_low_u16(vdev)); |
+ dev_hi = vmovl_u16(vget_high_u16(vdev)); |
+ |
+ // unpack them in 32 bits |
+ dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16); |
+ dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16); |
+ |
+ // blend with color |
+ tmp = (color - dev_lo) * vmovl_u16(valpha_lo); |
+ tmp = vshrq_n_u32(tmp, 5); |
+ dev_lo += tmp; |
+ |
+ tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi)); |
+ tmp = vshrq_n_u32(tmp, 5); |
+ dev_hi += tmp; |
+ |
+ // re-compact |
+ wn1 = dev_lo & vmask_ng16; |
+ wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16; |
+ odev_lo = vmovn_u32(wn1 | wn2); |
+ |
+ wn1 = dev_hi & vmask_ng16; |
+ wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16; |
+ odev_hi = vmovn_u32(wn1 | wn2); |
+ |
+ // store |
+ vst1q_u16(device, vcombine_u16(odev_lo, odev_hi)); |
device += UNROLL; |
alpha += UNROLL; |
@@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask, |
} while (w >= UNROLL); |
} |
- /* residuals (which is everything if we have no neon) */ |
+ // residuals |
while (w > 0) { |
*device = blend_compact(expanded32, SkExpand_rgb_16(*device), |
SkAlpha255To256(*alpha++) >> 3); |