OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 408 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
419 | 419 |
420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
421 alpha_mask = vld1_u8(alpha_mask_setup); | 421 alpha_mask = vld1_u8(alpha_mask_setup); |
422 | 422 |
423 /* do the NEON unrolled code */ | 423 /* do the NEON unrolled code */ |
424 #define UNROLL 4 | 424 #define UNROLL 4 |
425 while (count >= UNROLL) { | 425 while (count >= UNROLL) { |
426 uint8x8_t src_raw, dst_raw, dst_final; | 426 uint8x8_t src_raw, dst_raw, dst_final; |
427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
428 | 428 |
| 429 /* The two prefetches below may make the code slightly |
| 430 * slower for small values of count but are worth having |
| 431 * in the general case. |
| 432 */ |
| 433 __builtin_prefetch(src+32); |
| 434 __builtin_prefetch(dst+32); |
| 435 |
429 /* get the source */ | 436 /* get the source */ |
430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 437 src_raw = vreinterpret_u8_u32(vld1_u32(src)); |
431 #if UNROLL > 2 | 438 #if UNROLL > 2 |
432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |
433 #endif | 440 #endif |
434 | 441 |
435 /* get and hold the dst too */ | 442 /* get and hold the dst too */ |
436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |
437 #if UNROLL > 2 | 444 #if UNROLL > 2 |
438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
439 #endif | 446 #endif |
440 | 447 |
441 /* 1st and 2nd bits of the unrolling */ | 448 /* 1st and 2nd bits of the unrolling */ |
442 { | 449 { |
443 uint8x8_t dst_cooked; | 450 uint8x8_t dst_cooked; |
444 uint16x8_t dst_wide; | 451 uint16x8_t dst_wide; |
445 uint8x8_t alpha_narrow; | 452 uint8x8_t alpha_narrow; |
446 uint16x8_t alpha_wide; | 453 uint16x8_t alpha_wide; |
447 | 454 |
448 /* get the alphas spread out properly */ | 455 /* get the alphas spread out properly */ |
449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |
450 #if 1 | |
451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
452 /* we collapsed (255-a)+1 ... */ | |
453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
454 #else | |
455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
457 #endif | |
458 | 458 |
459 /* spread the dest */ | 459 /* spread the dest */ |
460 dst_wide = vmovl_u8(dst_raw); | 460 dst_wide = vmovl_u8(dst_raw); |
461 | 461 |
462 /* alpha mul the dest */ | 462 /* alpha mul the dest */ |
463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
464 dst_cooked = vshrn_n_u16(dst_wide, 8); | 464 dst_cooked = vshrn_n_u16(dst_wide, 8); |
465 | 465 |
466 /* sum -- ignoring any byte lane overflows */ | 466 /* sum -- ignoring any byte lane overflows */ |
467 dst_final = vadd_u8(src_raw, dst_cooked); | 467 dst_final = vadd_u8(src_raw, dst_cooked); |
468 } | 468 } |
469 | 469 |
470 #if UNROLL > 2 | 470 #if UNROLL > 2 |
471 /* the 3rd and 4th bits of our unrolling */ | 471 /* the 3rd and 4th bits of our unrolling */ |
472 { | 472 { |
473 uint8x8_t dst_cooked; | 473 uint8x8_t dst_cooked; |
474 uint16x8_t dst_wide; | 474 uint16x8_t dst_wide; |
475 uint8x8_t alpha_narrow; | 475 uint8x8_t alpha_narrow; |
476 uint16x8_t alpha_wide; | 476 uint16x8_t alpha_wide; |
477 | 477 |
478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |
479 #if 1 | |
480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
481 /* we collapsed (255-a)+1 ... */ | |
482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
483 #else | |
484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
486 #endif | |
487 | 480 |
488 /* spread the dest */ | 481 /* spread the dest */ |
489 dst_wide = vmovl_u8(dst_raw_2); | 482 dst_wide = vmovl_u8(dst_raw_2); |
490 | 483 |
491 /* alpha mul the dest */ | 484 /* alpha mul the dest */ |
492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
493 dst_cooked = vshrn_n_u16(dst_wide, 8); | 486 dst_cooked = vshrn_n_u16(dst_wide, 8); |
494 | 487 |
495 /* sum -- ignoring any byte lane overflows */ | 488 /* sum -- ignoring any byte lane overflows */ |
496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |
(...skipping 792 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1289 * case where we do not inspect the src alpha. | 1282 * case where we do not inspect the src alpha. |
1290 */ | 1283 */ |
1291 #if SK_A32_SHIFT == 24 | 1284 #if SK_A32_SHIFT == 24 |
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1285 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1286 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1294 #else | 1287 #else |
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1288 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1296 #endif | 1289 #endif |
1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1290 S32A_Blend_BlitRow32_arm // S32A_Blend |
1298 }; | 1291 }; |
OLD | NEW |