Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
index 00086c37898915d25ed463df09fdb05970346efd..9610afb562576c5084d254fd8003befb5efefbc5 100644 |
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
@@ -767,6 +767,98 @@ void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
} |
} |
+// Blends 'count' 32-bit pixels from src into dst with NEON, scaling the |
+// source by a global alpha. With alpha256 = SkAlpha255To256(alpha), every |
+// byte channel of each pixel is computed as |
+//     dst_scale = 256 - ((src_byte3 * alpha256) >> 8) |
+//     result    = ((dst * dst_scale) >> 8) + ((src * alpha256) >> 8) |
+// where src_byte3 is byte 3 of the source pixel (presumably its alpha |
+// channel -- confirm against the SkPMColor layout). Returns immediately |
+// when count <= 0. |
+void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
+                               const SkPMColor* SK_RESTRICT src, |
+                               int count, U8CPU alpha) { |
+ |
+    SkASSERT(255 >= alpha); |
+ |
+    if (count <= 0) { |
+        return; |
+    } |
+ |
+    const unsigned alpha256 = SkAlpha255To256(alpha); |
+ |
+    // An odd count leaves one pixel that cannot be paired: blend it on its |
+    // own in lane 0 of otherwise zeroed vectors. |
+    if (count & 1) { |
+        const uint32x2_t zero = vdup_n_u32(0); |
+ |
+        // Load a single pixel from each row into the low 32 bits. |
+        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0)); |
+        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0)); |
+ |
+        // Scalar dst scale derived from byte 3 of the source pixel. |
+        const unsigned src_byte3 = vget_lane_u8(src_px, 3); |
+        const unsigned dst_scale = 256 - ((src_byte3 * alpha256) >> 8); |
+ |
+        // Widen each byte to 16 bits and apply the per-row scale factors. |
+        const uint16x8_t src_scaled = vmulq_n_u16(vmovl_u8(src_px), alpha256); |
+        const uint16x8_t dst_scaled = vmulq_n_u16(vmovl_u8(dst_px), dst_scale); |
+ |
+        // Narrow back to bytes (dropping the low 8 bits) and sum. |
+        const uint8x8_t blended = vadd_u8(vshrn_n_u16(dst_scaled, 8), |
+                                          vshrn_n_u16(src_scaled, 8)); |
+ |
+        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0); |
+        ++dst; |
+        ++src; |
+        --count; |
+    } |
+ |
+    if (count == 0) { |
+        return; |
+    } |
+ |
+    // Table-lookup indices: lanes 0-3 pick byte 3 (pixel 0), lanes 4-7 pick |
+    // byte 7 (byte 3 of pixel 1), so each channel sees its own pixel's byte 3. |
+    static const uint8_t byte3_lanes[] = {3,3,3,3,7,7,7,7}; |
+    const uint8x8_t byte3_lut = vld1_u8(byte3_lanes); |
+ |
+    // The remaining count is even: process two pixels per iteration. |
+    do { |
+        __builtin_prefetch(src+32); |
+        __builtin_prefetch(dst+32); |
+ |
+        // Load two pixels from each row. |
+        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_u32(src)); |
+        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_u32(dst)); |
+ |
+        // Source scale, replicated across all eight 16-bit lanes. |
+        const uint16x8_t src_scale = vdupq_n_u16(alpha256); |
+ |
+        // Per-lane dst scale: 256 - ((src_byte3 * alpha256) >> 8). |
+        const uint8x8_t src_byte3 = vtbl1_u8(src_px, byte3_lut); |
+        uint16x8_t dst_scale = vmulq_u16(vmovl_u8(src_byte3), src_scale); |
+        dst_scale = vshrq_n_u16(dst_scale, 8); |
+        dst_scale = vsubq_u16(vdupq_n_u16(256), dst_scale); |
+ |
+        // Widen and scale both rows. |
+        const uint16x8_t src_scaled = vmulq_u16(vmovl_u8(src_px), src_scale); |
+        const uint16x8_t dst_scaled = vmulq_u16(vmovl_u8(dst_px), dst_scale); |
+ |
+        // Narrow back to bytes (dropping the low 8 bits) and sum. |
+        const uint8x8_t blended = vadd_u8(vshrn_n_u16(dst_scaled, 8), |
+                                          vshrn_n_u16(src_scaled, 8)); |
+ |
+        vst1_u32(dst, vreinterpret_u32_u8(blended)); |
+ |
+        src += 2; |
+        dst += 2; |
+        count -= 2; |
+    } while (count != 0); |
+} |
+ |
/////////////////////////////////////////////////////////////////////////////// |
#undef DEBUG_OPAQUE_DITHER |
@@ -1294,5 +1386,5 @@ const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
#else |
S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
#endif |
- S32A_Blend_BlitRow32_arm // S32A_Blend |
+ S32A_Blend_BlitRow32_neon // S32A_Blend |
}; |