OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 749 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
760 src += 1; | 760 src += 1; |
761 dst += 1; | 761 dst += 1; |
762 } while (--count > 0); | 762 } while (--count > 0); |
763 } | 763 } |
764 #endif | 764 #endif |
765 | 765 |
766 #undef UNROLL | 766 #undef UNROLL |
767 } | 767 } |
768 } | 768 } |
769 | 769 |
| 770 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| 771 const SkPMColor* SK_RESTRICT src, |
| 772 int count, U8CPU alpha) { |
| 773 |
| 774 SkASSERT(255 >= alpha); |
| 775 |
| 776 if (count <= 0) { |
| 777 return; |
| 778 } |
| 779 |
| 780 unsigned alpha256 = SkAlpha255To256(alpha); |
| 781 |
| 782 // First deal with odd counts |
| 783 if (count & 1) { |
| 784 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; |
| 785 uint16x8_t vdst_wide, vsrc_wide; |
| 786 unsigned dst_scale; |
| 787 |
| 788 // Load |
| 789 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc),
0)); |
| 790 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst),
0)); |
| 791 |
| 792 // Calc dst_scale |
| 793 dst_scale = vget_lane_u8(vsrc, 3); |
| 794 dst_scale *= alpha256; |
| 795 dst_scale >>= 8; |
| 796 dst_scale = 256 - dst_scale; |
| 797 |
| 798 // Process src |
| 799 vsrc_wide = vmovl_u8(vsrc); |
| 800 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); |
| 801 |
| 802 // Process dst |
| 803 vdst_wide = vmovl_u8(vdst); |
| 804 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); |
| 805 |
| 806 // Combine |
| 807 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 808 |
| 809 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); |
| 810 dst++; |
| 811 src++; |
| 812 count--; |
| 813 } |
| 814 |
| 815 if (count) { |
| 816 uint8x8_t alpha_mask; |
| 817 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
| 818 alpha_mask = vld1_u8(alpha_mask_setup); |
| 819 |
| 820 do { |
| 821 |
| 822 uint8x8_t vsrc, vdst, vres, vsrc_alphas; |
| 823 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; |
| 824 |
| 825 __builtin_prefetch(src+32); |
| 826 __builtin_prefetch(dst+32); |
| 827 |
| 828 // Load |
| 829 vsrc = vreinterpret_u8_u32(vld1_u32(src)); |
| 830 vdst = vreinterpret_u8_u32(vld1_u32(dst)); |
| 831 |
| 832 // Prepare src_scale |
| 833 vsrc_scale = vdupq_n_u16(alpha256); |
| 834 |
| 835 // Calc dst_scale |
| 836 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); |
| 837 vdst_scale = vmovl_u8(vsrc_alphas); |
| 838 vdst_scale *= vsrc_scale; |
| 839 vdst_scale = vshrq_n_u16(vdst_scale, 8); |
| 840 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); |
| 841 |
| 842 // Process src |
| 843 vsrc_wide = vmovl_u8(vsrc); |
| 844 vsrc_wide *= vsrc_scale; |
| 845 |
| 846 // Process dst |
| 847 vdst_wide = vmovl_u8(vdst); |
| 848 vdst_wide *= vdst_scale; |
| 849 |
| 850 // Combine |
| 851 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); |
| 852 |
| 853 vst1_u32(dst, vreinterpret_u32_u8(vres)); |
| 854 |
| 855 src += 2; |
| 856 dst += 2; |
| 857 count -= 2; |
| 858 } while(count); |
| 859 } |
| 860 } |
| 861 |
770 /////////////////////////////////////////////////////////////////////////////// | 862 /////////////////////////////////////////////////////////////////////////////// |
771 | 863 |
772 #undef DEBUG_OPAQUE_DITHER | 864 #undef DEBUG_OPAQUE_DITHER |
773 | 865 |
774 #if defined(DEBUG_OPAQUE_DITHER) | 866 #if defined(DEBUG_OPAQUE_DITHER) |
775 static void showme8(char *str, void *p, int len) | 867 static void showme8(char *str, void *p, int len) |
776 { | 868 { |
777 static char buf[256]; | 869 static char buf[256]; |
778 char tbuf[32]; | 870 char tbuf[32]; |
779 int i; | 871 int i; |
(...skipping 507 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1287 * the performance will almost certainly be worse. However, for many | 1379 * the performance will almost certainly be worse. However, for many |
1288 * common cases the performance is equivalent or better than the standard | 1380 * common cases the performance is equivalent or better than the standard |
1289 * case where we do not inspect the src alpha. | 1381 * case where we do not inspect the src alpha. |
1290 */ | 1382 */ |
1291 #if SK_A32_SHIFT == 24 | 1383 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1384 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1385 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1386 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1387 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1388 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1389 S32A_Blend_BlitRow32_neon // S32A_Blend |
1298 }; | 1390 }; |
OLD | NEW |