Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(370)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 18459008: ARM Skia NEON patches - 13 - S32A_Opaque (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Add the requested comment. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm.h" 8 #include "SkBlitRow_opts_arm.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 408 matching lines...) Expand 10 before | Expand all | Expand 10 after
419 419
420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
421 alpha_mask = vld1_u8(alpha_mask_setup); 421 alpha_mask = vld1_u8(alpha_mask_setup);
422 422
423 /* do the NEON unrolled code */ 423 /* do the NEON unrolled code */
424 #define UNROLL 4 424 #define UNROLL 4
425 while (count >= UNROLL) { 425 while (count >= UNROLL) {
426 uint8x8_t src_raw, dst_raw, dst_final; 426 uint8x8_t src_raw, dst_raw, dst_final;
427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
428 428
429 /* The two prefetches below may make the code slightly
430 * slower for small values of count but are worth having
431 * in the general case.
432 */
433 __builtin_prefetch(src+32);
434 __builtin_prefetch(dst+32);
435
429 /* get the source */ 436 /* get the source */
430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 437 src_raw = vreinterpret_u8_u32(vld1_u32(src));
431 #if UNROLL > 2 438 #if UNROLL > 2
432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
433 #endif 440 #endif
434 441
435 /* get and hold the dst too */ 442 /* get and hold the dst too */
436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
437 #if UNROLL > 2 444 #if UNROLL > 2
438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
439 #endif 446 #endif
440 447
441 /* 1st and 2nd bits of the unrolling */ 448 /* 1st and 2nd bits of the unrolling */
442 { 449 {
443 uint8x8_t dst_cooked; 450 uint8x8_t dst_cooked;
444 uint16x8_t dst_wide; 451 uint16x8_t dst_wide;
445 uint8x8_t alpha_narrow; 452 uint8x8_t alpha_narrow;
446 uint16x8_t alpha_wide; 453 uint16x8_t alpha_wide;
447 454
448 /* get the alphas spread out properly */ 455 /* get the alphas spread out properly */
449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
450 #if 1
451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
452 /* we collapsed (255-a)+1 ... */
453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
454 #else
455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
457 #endif
458 458
459 /* spread the dest */ 459 /* spread the dest */
460 dst_wide = vmovl_u8(dst_raw); 460 dst_wide = vmovl_u8(dst_raw);
461 461
462 /* alpha mul the dest */ 462 /* alpha mul the dest */
463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
464 dst_cooked = vshrn_n_u16(dst_wide, 8); 464 dst_cooked = vshrn_n_u16(dst_wide, 8);
465 465
466 /* sum -- ignoring any byte lane overflows */ 466 /* sum -- ignoring any byte lane overflows */
467 dst_final = vadd_u8(src_raw, dst_cooked); 467 dst_final = vadd_u8(src_raw, dst_cooked);
468 } 468 }
469 469
470 #if UNROLL > 2 470 #if UNROLL > 2
471 /* the 3rd and 4th bits of our unrolling */ 471 /* the 3rd and 4th bits of our unrolling */
472 { 472 {
473 uint8x8_t dst_cooked; 473 uint8x8_t dst_cooked;
474 uint16x8_t dst_wide; 474 uint16x8_t dst_wide;
475 uint8x8_t alpha_narrow; 475 uint8x8_t alpha_narrow;
476 uint16x8_t alpha_wide; 476 uint16x8_t alpha_wide;
477 477
478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
479 #if 1
480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
481 /* we collapsed (255-a)+1 ... */
482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
483 #else
484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
486 #endif
487 480
488 /* spread the dest */ 481 /* spread the dest */
489 dst_wide = vmovl_u8(dst_raw_2); 482 dst_wide = vmovl_u8(dst_raw_2);
490 483
491 /* alpha mul the dest */ 484 /* alpha mul the dest */
492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
493 dst_cooked = vshrn_n_u16(dst_wide, 8); 486 dst_cooked = vshrn_n_u16(dst_wide, 8);
494 487
495 /* sum -- ignoring any byte lane overflows */ 488 /* sum -- ignoring any byte lane overflows */
496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
(...skipping 792 matching lines...) Expand 10 before | Expand all | Expand 10 after
1289 * case where we do not inspect the src alpha. 1282 * case where we do not inspect the src alpha.
1290 */ 1283 */
1291 #if SK_A32_SHIFT == 24 1284 #if SK_A32_SHIFT == 24
1292 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor 1285 // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1293 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1286 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1294 #else 1287 #else
1295 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1288 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1296 #endif 1289 #endif
1297 S32A_Blend_BlitRow32_arm // S32A_Blend 1290 S32A_Blend_BlitRow32_arm // S32A_Blend
1298 }; 1291 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698