src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 18614010: ARM Skia NEON patches - 14 - S32A_Blend

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 18614010: ARM Skia NEON patches - 14 - S32A_Blend (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Fix uninitialized variable warnings Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm.h"	8 #include "SkBlitRow_opts_arm.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 749 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
760 src += 1;	760 src += 1;

761 dst += 1;	761 dst += 1;

762 } while (--count > 0);	762 } while (--count > 0);

763 }	763 }

764 #endif	764 #endif

765	765

766 #undef UNROLL	766 #undef UNROLL

767 }	767 }

768 }	768 }

769	769

	770 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

	771 const SkPMColor* SK_RESTRICT src,

	772 int count, U8CPU alpha) {

	773

	774 SkASSERT(255 >= alpha);

	775

	776 if (count <= 0) {

	777 return;

	778 }

	779

	780 unsigned alpha256 = SkAlpha255To256(alpha);

	781

	782 // First deal with odd counts

	783 if (count & 1) {

	784 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;

	785 uint16x8_t vdst_wide, vsrc_wide;

	786 unsigned dst_scale;

	787

	788 // Load

	789 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));

	790 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

	791

	792 // Calc dst_scale

	793 dst_scale = vget_lane_u8(vsrc, 3);

	794 dst_scale *= alpha256;

	795 dst_scale >>= 8;

	796 dst_scale = 256 - dst_scale;

	797

	798 // Process src

	799 vsrc_wide = vmovl_u8(vsrc);

	800 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

	801

	802 // Process dst

	803 vdst_wide = vmovl_u8(vdst);

	804 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

	805

	806 // Combine

	807 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	808

	809 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);

	810 dst++;

	811 src++;

	812 count--;

	813 }

	814

	815 if (count) {

	816 uint8x8_t alpha_mask;

	817 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};

	818 alpha_mask = vld1_u8(alpha_mask_setup);

	819

	820 do {

	821

	822 uint8x8_t vsrc, vdst, vres, vsrc_alphas;

	823 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

	824

	825 __builtin_prefetch(src+32);

	826 __builtin_prefetch(dst+32);

	827

	828 // Load

	829 vsrc = vreinterpret_u8_u32(vld1_u32(src));

	830 vdst = vreinterpret_u8_u32(vld1_u32(dst));

	831

	832 // Prepare src_scale

	833 vsrc_scale = vdupq_n_u16(alpha256);

	834

	835 // Calc dst_scale

	836 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);

	837 vdst_scale = vmovl_u8(vsrc_alphas);

	838 vdst_scale *= vsrc_scale;

	839 vdst_scale = vshrq_n_u16(vdst_scale, 8);

	840 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

	841

	842 // Process src

	843 vsrc_wide = vmovl_u8(vsrc);

	844 vsrc_wide *= vsrc_scale;

	845

	846 // Process dst

	847 vdst_wide = vmovl_u8(vdst);

	848 vdst_wide *= vdst_scale;

	849

	850 // Combine

	851 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

	852

	853 vst1_u32(dst, vreinterpret_u32_u8(vres));

	854

	855 src += 2;

	856 dst += 2;

	857 count -= 2;

	858 } while(count);

	859 }

	860 }

	861

770 ///////////////////////////////////////////////////////////////////////////////	862 ///////////////////////////////////////////////////////////////////////////////

771	863

772 #undef DEBUG_OPAQUE_DITHER	864 #undef DEBUG_OPAQUE_DITHER

773	865

774 #if defined(DEBUG_OPAQUE_DITHER)	866 #if defined(DEBUG_OPAQUE_DITHER)

775 static void showme8(char *str, void *p, int len)	867 static void showme8(char *str, void *p, int len)

776 {	868 {

777 static char buf[256];	869 static char buf[256];

778 char tbuf[32];	870 char tbuf[32];

779 int i;	871 int i;

(...skipping 507 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1287 * the performance will almost certainly be worse. However, for many	1379 * the performance will almost certainly be worse. However, for many

1288 * common cases the performance is equivalent or better than the standard	1380 * common cases the performance is equivalent or better than the standard

1289 * case where we do not inspect the src alpha.	1381 * case where we do not inspect the src alpha.

1290 */	1382 */

1291 #if SK_A32_SHIFT == 24	1383 #if SK_A32_SHIFT == 24

1292 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1384 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1385 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1294 #else	1386 #else

1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1387 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1296 #endif	1388 #endif

1297 S32A_Blend_BlitRow32_arm // S32A_Blend	1389 S32A_Blend_BlitRow32_neon // S32A_Blend

1298 };	1390 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »