| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
| 9 | 9 |
| 10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
| (...skipping 408 matching lines...) |
| 419 | 419 |
| 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; | 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; |
| 421 alpha_mask = vld1_u8(alpha_mask_setup); | 421 alpha_mask = vld1_u8(alpha_mask_setup); |
| 422 | 422 |
| 423 /* do the NEON unrolled code */ | 423 /* do the NEON unrolled code */ |
| 424 #define UNROLL 4 | 424 #define UNROLL 4 |
| 425 while (count >= UNROLL) { | 425 while (count >= UNROLL) { |
| 426 uint8x8_t src_raw, dst_raw, dst_final; | 426 uint8x8_t src_raw, dst_raw, dst_final; |
| 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; | 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; |
| 428 | 428 |
| 429 /* The two prefetches below may make the code slightly
| 430 * slower for small values of count but are worth having |
| 431 * in the general case. |
| 432 */ |
| 433 __builtin_prefetch(src+32); |
| 434 __builtin_prefetch(dst+32); |
| 435 |
| 429 /* get the source */ | 436 /* get the source */ |
| 430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); | 437 src_raw = vreinterpret_u8_u32(vld1_u32(src)); |
| 431 #if UNROLL > 2 | 438 #if UNROLL > 2 |
| 432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); | 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); |
| 433 #endif | 440 #endif |
| 434 | 441 |
| 435 /* get and hold the dst too */ | 442 /* get and hold the dst too */ |
| 436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); | 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); |
| 437 #if UNROLL > 2 | 444 #if UNROLL > 2 |
| 438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); | 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); |
| 439 #endif | 446 #endif |
| 440 | 447 |
| 441 /* 1st and 2nd bits of the unrolling */ | 448 /* 1st and 2nd bits of the unrolling */ |
| 442 { | 449 { |
| 443 uint8x8_t dst_cooked; | 450 uint8x8_t dst_cooked; |
| 444 uint16x8_t dst_wide; | 451 uint16x8_t dst_wide; |
| 445 uint8x8_t alpha_narrow; | 452 uint8x8_t alpha_narrow; |
| 446 uint16x8_t alpha_wide; | 453 uint16x8_t alpha_wide; |
| 447 | 454 |
| 448 /* get the alphas spread out properly */ | 455 /* get the alphas spread out properly */ |
| 449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); | 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); |
| 450 #if 1 | |
| 451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 452 /* we collapsed (255-a)+1 ... */ | |
| 453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
| 454 #else | |
| 455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
| 456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
| 457 #endif | |
| 458 | 458 |
| 459 /* spread the dest */ | 459 /* spread the dest */ |
| 460 dst_wide = vmovl_u8(dst_raw); | 460 dst_wide = vmovl_u8(dst_raw); |
| 461 | 461 |
| 462 /* alpha mul the dest */ | 462 /* alpha mul the dest */ |
| 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
| 464 dst_cooked = vshrn_n_u16(dst_wide, 8); | 464 dst_cooked = vshrn_n_u16(dst_wide, 8); |
| 465 | 465 |
| 466 /* sum -- ignoring any byte lane overflows */ | 466 /* sum -- ignoring any byte lane overflows */ |
| 467 dst_final = vadd_u8(src_raw, dst_cooked); | 467 dst_final = vadd_u8(src_raw, dst_cooked); |
| 468 } | 468 } |
| 469 | 469 |
| 470 #if UNROLL > 2 | 470 #if UNROLL > 2 |
| 471 /* the 3rd and 4th bits of our unrolling */ | 471 /* the 3rd and 4th bits of our unrolling */ |
| 472 { | 472 { |
| 473 uint8x8_t dst_cooked; | 473 uint8x8_t dst_cooked; |
| 474 uint16x8_t dst_wide; | 474 uint16x8_t dst_wide; |
| 475 uint8x8_t alpha_narrow; | 475 uint8x8_t alpha_narrow; |
| 476 uint16x8_t alpha_wide; | 476 uint16x8_t alpha_wide; |
| 477 | 477 |
| 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); | 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); |
| 479 #if 1 | |
| 480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ | |
| 481 /* we collapsed (255-a)+1 ... */ | |
| 482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); | 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); |
| 483 #else | |
| 484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); | |
| 485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); | |
| 486 #endif | |
| 487 | 480 |
| 488 /* spread the dest */ | 481 /* spread the dest */ |
| 489 dst_wide = vmovl_u8(dst_raw_2); | 482 dst_wide = vmovl_u8(dst_raw_2); |
| 490 | 483 |
| 491 /* alpha mul the dest */ | 484 /* alpha mul the dest */ |
| 492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); | 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide); |
| 493 dst_cooked = vshrn_n_u16(dst_wide, 8); | 486 dst_cooked = vshrn_n_u16(dst_wide, 8); |
| 494 | 487 |
| 495 /* sum -- ignoring any byte lane overflows */ | 488 /* sum -- ignoring any byte lane overflows */ |
| 496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); | 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); |
| (...skipping 792 matching lines...) |
| 1289 * case where we do not inspect the src alpha. | 1282 * case where we do not inspect the src alpha. |
| 1290 */ | 1283 */ |
| 1291 #if SK_A32_SHIFT == 24 | 1284 #if SK_A32_SHIFT == 24 |
| 1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor | 1285 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor |
| 1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1286 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
| 1294 #else | 1287 #else |
| 1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1288 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
| 1296 #endif | 1289 #endif |
| 1297 S32A_Blend_BlitRow32_arm // S32A_Blend | 1290 S32A_Blend_BlitRow32_arm // S32A_Blend |
| 1298 }; | 1291 }; |
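Note on the collapsed alpha math kept at new lines 457 and 479: SkAlpha255To256(a)
is a + 1 (the semantics the deleted comment referenced), so applying it to the
inverted alpha folds into a single subtraction, (255 - a) + 1 == 256 - a, which
is what vsubw_u8(vdupq_n_u16(256), alpha_narrow) computes per lane. A minimal
scalar sketch of the same arithmetic, for reference only (the helper names are
illustrative, not Skia API):

    #include <stdint.h>

    /* SkAlpha255To256(255 - a) == (255 - a) + 1 == 256 - a */
    static inline uint16_t inv_alpha_scale(uint8_t a) {
        return (uint16_t)(256 - a);
    }

    /* One channel of the blend done by the unrolled loop: widen dst,
     * scale by (256 - a), narrow with >> 8, then add src, ignoring any
     * byte-lane overflow exactly as vadd_u8 does. */
    static inline uint8_t blend_channel(uint8_t src, uint8_t dst, uint8_t a) {
        return (uint8_t)(src + (((uint16_t)dst * inv_alpha_scale(a)) >> 8));
    }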