OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 408 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
419 | 419 |
420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
421 alpha_mask = vld1_u8(alpha_mask_setup); | 421 alpha_mask = vld1_u8(alpha_mask_setup); |
422 | 422 |
423 /* do the NEON unrolled code */ | 423 /* do the NEON unrolled code */ |
424 #define UNROLL 4 | 424 #define UNROLL 4 |
425 while (count >= UNROLL) { | 425 while (count >= UNROLL) { |
426 uint8x8_t src_raw, dst_raw, dst_final; | 426 uint8x8_t src_raw, dst_raw, dst_final; |
427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
428 | 428 |
| 429 /* The two prefetches below may make the code slightly |
| 430 * slower for small values of count but are worth having |
| 431 * in the general case. |
| 432 */ |
| 433 __builtin_prefetch(src+32); |
| 434 __builtin_prefetch(dst+32); |
| 435 |
429 /* get the source */ | 436 /* get the source */ |
430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 437 src_raw = vreinterpret_u8_u32(vld1_u32(src)); |
431 #if UNROLL > 2 | 438 #if UNROLL > 2 |
432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |
433 #endif | 440 #endif |
434 | 441 |
435 /* get and hold the dst too */ | 442 /* get and hold the dst too */ |
436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |
437 #if UNROLL > 2 | 444 #if UNROLL > 2 |
438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
439 #endif | 446 #endif |
440 | 447 |
441 /* 1st and 2nd bits of the unrolling */ | 448 /* 1st and 2nd bits of the unrolling */ |
442 { | 449 { |
443 uint8x8_t dst_cooked; | 450 uint8x8_t dst_cooked; |
444 uint16x8_t dst_wide; | 451 uint16x8_t dst_wide; |
445 uint8x8_t alpha_narrow; | 452 uint8x8_t alpha_narrow; |
446 uint16x8_t alpha_wide; | 453 uint16x8_t alpha_wide; |
447 | 454 |
448 /* get the alphas spread out properly */ | 455 /* get the alphas spread out properly */ |
449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |
450 #if 1 | |
451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
452 /* we collapsed (255-a)+1 ... */ | |
453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
454 #else | |
455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
457 #endif | |
458 | 458 |
459 /* spread the dest */ | 459 /* spread the dest */ |
460 dst_wide = vmovl_u8(dst_raw); | 460 dst_wide = vmovl_u8(dst_raw); |
461 | 461 |
462 /* alpha mul the dest */ | 462 /* alpha mul the dest */ |
463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
464 dst_cooked = vshrn_n_u16(dst_wide, 8); | 464 dst_cooked = vshrn_n_u16(dst_wide, 8); |
465 | 465 |
466 /* sum -- ignoring any byte lane overflows */ | 466 /* sum -- ignoring any byte lane overflows */ |
467 dst_final = vadd_u8(src_raw, dst_cooked); | 467 dst_final = vadd_u8(src_raw, dst_cooked); |
468 } | 468 } |
469 | 469 |
470 #if UNROLL > 2 | 470 #if UNROLL > 2 |
471 /* the 3rd and 4th bits of our unrolling */ | 471 /* the 3rd and 4th bits of our unrolling */ |
472 { | 472 { |
473 uint8x8_t dst_cooked; | 473 uint8x8_t dst_cooked; |
474 uint16x8_t dst_wide; | 474 uint16x8_t dst_wide; |
475 uint8x8_t alpha_narrow; | 475 uint8x8_t alpha_narrow; |
476 uint16x8_t alpha_wide; | 476 uint16x8_t alpha_wide; |
477 | 477 |
478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |
479 #if 1 | |
480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
481 /* we collapsed (255-a)+1 ... */ | |
482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
483 #else | |
484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
486 #endif | |
487 | 480 |
488 /* spread the dest */ | 481 /* spread the dest */ |
489 dst_wide = vmovl_u8(dst_raw_2); | 482 dst_wide = vmovl_u8(dst_raw_2); |
490 | 483 |
491 /* alpha mul the dest */ | 484 /* alpha mul the dest */ |
492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
493 dst_cooked = vshrn_n_u16(dst_wide, 8); | 486 dst_cooked = vshrn_n_u16(dst_wide, 8); |
494 | 487 |
495 /* sum -- ignoring any byte lane overflows */ | 488 /* sum -- ignoring any byte lane overflows */ |
496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |
(...skipping 792 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1289 * case where we do not inspect the src alpha. | 1282 * case where we do not inspect the src alpha. |
1290 */ | 1283 */ |
1291 #if SK_A32_SHIFT == 24 | 1284 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1285 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1286 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1287 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1288 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1289 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1290 S32A_Blend_BlitRow32_arm // S32A_Blend |
1298 }; | 1291 }; |
OLD | NEW |