| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 749 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 760 src += 1; | 760 src += 1; |
| 761 dst += 1; | 761 dst += 1; |
| 762 } while (--count > 0); | 762 } while (--count > 0); |
| 763 } | 763 } |
| 764 #endif | 764 #endif |
| 765 | 765 |
| 766 #undef UNROLL | 766 #undef UNROLL |
| 767 } | 767 } |
| 768 } | 768 } |
| 769 | 769 |
| 770 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 771 const SkPMColor* SK_RESTRICT src, |
| 772 int count, U8CPU alpha) { |
| 773 |
| 774 SkASSERT(255 >= alpha); |
| 775 |
| 776 if (count <= 0) { |
| 777 return; |
| 778 } |
| 779 |
| 780 unsigned alpha256 = SkAlpha255To256(alpha); |
| 781 |
| 782 // First deal with odd counts |
| 783 if (count & 1) { |
| 784 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; |
| 785 uint16x8_t vdst_wide, vsrc_wide; |
| 786 unsigned dst_scale; |
| 787 |
| 788 // Load |
| 789 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
| 790 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
| 791 |
| 792 // Calc dst_scale |
| 793 dst_scale = vget_lane_u8(vsrc, 3); |
| 794 dst_scale *= alpha256; |
| 795 dst_scale >>= 8; |
| 796 dst_scale = 256 - dst_scale; |
| 797 |
| 798 // Process src |
| 799 vsrc_wide = vmovl_u8(vsrc); |
| 800 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); |
| 801 |
| 802 // Process dst |
| 803 vdst_wide = vmovl_u8(vdst); |
| 804 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); |
| 805 |
| 806 // Combine |
| 807 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 808 |
| 809 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
| 810 dst++; |
| 811 src++; |
| 812 count--; |
| 813 } |
| 814 |
| 815 if (count) { |
| 816 uint8x8_t alpha_mask; |
| 817 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
| 818 alpha_mask = vld1_u8(alpha_mask_setup); |
| 819 |
| 820 do { |
| 821 |
| 822 uint8x8_t vsrc, vdst, vres, vsrc_alphas; |
| 823 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; |
| 824 |
| 825 __builtin_prefetch(src+32); |
| 826 __builtin_prefetch(dst+32); |
| 827 |
| 828 // Load |
| 829 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
| 830 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
| 831 |
| 832 // Prepare src_scale |
| 833 vsrc_scale = vdupq_n_u16(alpha256); |
| 834 |
| 835 // Calc dst_scale |
| 836 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); |
| 837 vdst_scale = vmovl_u8(vsrc_alphas); |
| 838 vdst_scale *= vsrc_scale; |
| 839 vdst_scale = vshrq_n_u16(vdst_scale, 8); |
| 840 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); |
| 841 |
| 842 // Process src |
| 843 vsrc_wide = vmovl_u8(vsrc); |
| 844 vsrc_wide *= vsrc_scale; |
| 845 |
| 846 // Process dst |
| 847 vdst_wide = vmovl_u8(vdst); |
| 848 vdst_wide *= vdst_scale; |
| 849 |
| 850 // Combine |
| 851 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 852 |
| 853 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
| 854 |
| 855 src += 2; |
| 856 dst += 2; |
| 857 count -= 2; |
| 858 } while(count); |
| 859 } |
| 860 } |
| 861 |
| 770 /////////////////////////////////////////////////////////////////////////////// | 862 /////////////////////////////////////////////////////////////////////////////// |
| 771 | 863 |
| 772 #undef DEBUG_OPAQUE_DITHER | 864 #undef DEBUG_OPAQUE_DITHER |
| 773 | 865 |
| 774 #if defined(DEBUG_OPAQUE_DITHER) | 866 #if defined(DEBUG_OPAQUE_DITHER) |
| 775 static void showme8(char *str, void *p, int len) | 867 static void showme8(char *str, void *p, int len) |
| 776 { | 868 { |
| 777 static char buf[256]; | 869 static char buf[256]; |
| 778 char tbuf[32]; | 870 char tbuf[32]; |
| 779 int i; | 871 int i; |
| (...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1287 * the performance will almost certainly be worse. However, for many | 1379 * the performance will almost certainly be worse. However, for many |
| 1288 * common cases the performance is equivalent or better than the standard | 1380 * common cases the performance is equivalent or better than the standard |
| 1289 * case where we do not inspect the src alpha. | 1381 * case where we do not inspect the src alpha. |
| 1290 */ | 1382 */ |
| 1291 #if SK_A32_SHIFT == 24 | 1383 #if SK_A32_SHIFT == 24 |
| 1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1384 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
| 1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1385 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1294 #else | 1386 #else |
| 1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1387 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1296 #endif | 1388 #endif |
| 1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1389 S32A_Blend_BlitRow32_neon // S32A_Blend |
| 1298 }; | 1390 }; |
| OLD | NEW |