| Index: src/core/SkBlitter_RGB16.cpp
|
| diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp
|
| index 615a6a39dbe80fc7d5fefdc75e514a75b15fa296..86f0eb61576a89e8a0cc4bc4f2184865834f8fc2 100644
|
| --- a/src/core/SkBlitter_RGB16.cpp
|
| +++ b/src/core/SkBlitter_RGB16.cpp
|
| @@ -390,63 +390,53 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
|
| do {
|
| int w = width;
|
| if (w >= UNROLL) {
|
| - uint32x4_t color; /* can use same one */
|
| - uint32x4_t dev_lo, dev_hi;
|
| - uint32x4_t t1;
|
| - uint32x4_t wn1, wn2;
|
| - uint16x4_t odev_lo, odev_hi;
|
| - uint16x4_t alpha_lo, alpha_hi;
|
| - uint16x8_t alpha_full;
|
| -
|
| + uint32x4_t color, dev_lo, dev_hi;
|
| + uint32x4_t wn1, wn2, tmp;
|
| + uint32x4_t vmask_g16, vmask_ng16;
|
| + uint16x8_t valpha, vdev;
|
| + uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi;
|
| +
|
| + // prepare constants
|
| + vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE);
|
| + vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE);
|
| color = vdupq_n_u32(expanded32);
|
|
|
| do {
|
| - /* alpha is 8x8, widen and split to get pair of 16x4's */
|
| - alpha_full = vmovl_u8(vld1_u8(alpha));
|
| - alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7));
|
| - alpha_full = vshrq_n_u16(alpha_full, 3);
|
| - alpha_lo = vget_low_u16(alpha_full);
|
| - alpha_hi = vget_high_u16(alpha_full);
|
| -
|
| - dev_lo = vmovl_u16(vld1_u16(device));
|
| - dev_hi = vmovl_u16(vld1_u16(device+4));
|
| -
|
| - /* unpack in 32 bits */
|
| - dev_lo = vorrq_u32(
|
| - vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
|
| - vshlq_n_u32(vandq_u32(dev_lo,
|
| - vdupq_n_u32(0x000007E0)),
|
| - 16)
|
| - );
|
| - dev_hi = vorrq_u32(
|
| - vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
|
| - vshlq_n_u32(vandq_u32(dev_hi,
|
| - vdupq_n_u32(0x000007E0)),
|
| - 16)
|
| - );
|
| -
|
| - /* blend the two */
|
| - t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo));
|
| - t1 = vshrq_n_u32(t1, 5);
|
| - dev_lo = vaddq_u32(dev_lo, t1);
|
| -
|
| - t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi));
|
| - t1 = vshrq_n_u32(t1, 5);
|
| - dev_hi = vaddq_u32(dev_hi, t1);
|
| -
|
| - /* re-compact and store */
|
| - wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)),
|
| - wn2 = vshrq_n_u32(dev_lo, 16);
|
| - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
|
| - odev_lo = vmovn_u32(vorrq_u32(wn1, wn2));
|
| -
|
| - wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)),
|
| - wn2 = vshrq_n_u32(dev_hi, 16);
|
| - wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0));
|
| - odev_hi = vmovn_u32(vorrq_u32(wn1, wn2));
|
| -
|
| - vst1_u16(device, odev_lo);
|
| - vst1_u16(device+4, odev_hi);
|
| + // alpha is 8x8, widen and split to get a pair of 16x4
|
| + valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha));
|
| + valpha = vshrq_n_u16(valpha, 3);
|
| + valpha_lo = vget_low_u16(valpha);
|
| + valpha_hi = vget_high_u16(valpha);
|
| +
|
| + // load pixels
|
| + vdev = vld1q_u16(device);
|
| + dev_lo = vmovl_u16(vget_low_u16(vdev));
|
| + dev_hi = vmovl_u16(vget_high_u16(vdev));
|
| +
|
| + // unpack them in 32 bits
|
| + dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
|
| + dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);
|
| +
|
| + // blend with color
|
| + tmp = (color - dev_lo) * vmovl_u16(valpha_lo);
|
| + tmp = vshrq_n_u32(tmp, 5);
|
| + dev_lo += tmp;
|
| +
|
| + tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi));
|
| + tmp = vshrq_n_u32(tmp, 5);
|
| + dev_hi += tmp;
|
| +
|
| + // re-compact
|
| + wn1 = dev_lo & vmask_ng16;
|
| + wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16;
|
| + odev_lo = vmovn_u32(wn1 | wn2);
|
| +
|
| + wn1 = dev_hi & vmask_ng16;
|
| + wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16;
|
| + odev_hi = vmovn_u32(wn1 | wn2);
|
| +
|
| + // store
|
| + vst1q_u16(device, vcombine_u16(odev_lo, odev_hi));
|
|
|
| device += UNROLL;
|
| alpha += UNROLL;
|
| @@ -454,7 +444,7 @@ void SkRGB16_Opaque_Blitter::blitMask(const SkMask& mask,
|
| } while (w >= UNROLL);
|
| }
|
|
|
| - /* residuals (which is everything if we have no neon) */
|
| + // residuals
|
| while (w > 0) {
|
| *device = blend_compact(expanded32, SkExpand_rgb_16(*device),
|
| SkAlpha255To256(*alpha++) >> 3);
|
|
|