| Index: src/opts/SkBlitRow_opts_arm_neon.cpp | 
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp | 
| index 705ee998ddcfd6e64cb011c71cf7d8f25bbc0cd0..ffa0a8b3e41f981bc3499cf0b3322c3f89210fe9 100644 | 
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp | 
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp | 
| @@ -15,9 +15,45 @@ | 
| #include "SkUtils.h" | 
|  | 
| #include "SkCachePreload_arm.h" | 
| - | 
| +#include "SkColor_opts_neon.h" | 
| #include <arm_neon.h> | 
|  | 
| +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 
| +                           const SkPMColor* SK_RESTRICT src, int count, | 
| +                           U8CPU alpha, int /*x*/, int /*y*/) { | 
| +    SkASSERT(255 == alpha); | 
| + | 
| +    while (count >= 8) { | 
| +        uint8x8x4_t vsrc; | 
| +        uint16x8_t vdst; | 
| + | 
| +        // Load | 
| +        vsrc = vld4_u8((uint8_t*)src); | 
| + | 
| +        // Convert src to 565 | 
| +        vdst = vshll_n_u8(vsrc.val[NEON_R], 8); | 
| +        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); | 
| +        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); | 
| + | 
| +        // Store | 
| +        vst1q_u16(dst, vdst); | 
| + | 
| +        // Prepare next iteration | 
| +        dst += 8; | 
| +        src += 8; | 
| +        count -= 8; | 
| +    }; | 
| + | 
| +    // Leftovers | 
| +    while (count > 0) { | 
| +        SkPMColor c = *src++; | 
| +        SkPMColorAssert(c); | 
| +        *dst = SkPixel32ToPixel16_ToU16(c); | 
| +        dst++; | 
| +        count--; | 
| +    }; | 
| +} | 
| + | 
| void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, | 
| const SkPMColor* SK_RESTRICT src, int count, | 
| U8CPU alpha, int /*x*/, int /*y*/) { | 
| @@ -1330,10 +1366,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, | 
|  | 
| const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { | 
| // no dither | 
| -    // NOTE: For the two functions below, we don't have a special version | 
| -    //       that assumes that each source pixel is opaque. But our S32A is | 
| -    //       still faster than the default, so use it. | 
| -    S32A_D565_Opaque_neon,  // really S32_D565_Opaque | 
| +    // NOTE: For the S32_D565_Blend function below, we don't have a special | 
| +    //       version that assumes that each source pixel is opaque. But our | 
| +    //       S32A is still faster than the default, so use it. | 
| +    S32_D565_Opaque_neon, | 
| S32A_D565_Blend_neon,   // really S32_D565_Blend | 
| S32A_D565_Opaque_neon, | 
| S32A_D565_Blend_neon, | 
|  |