OLD | NEW |
1 | 1 |
2 /* | 2 /* |
3 * Copyright 2006 The Android Open Source Project | 3 * Copyright 2006 The Android Open Source Project |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license that can be | 5 * Use of this source code is governed by a BSD-style license that can be |
6 * found in the LICENSE file. | 6 * found in the LICENSE file. |
7 */ | 7 */ |
8 | 8 |
9 | 9 |
10 #include "SkBlitRow.h" | 10 #include "SkBlitRow.h" |
(...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
383 int height = clip.height(); | 383 int height = clip.height(); |
384 size_t deviceRB = fDevice.rowBytes() - (width << 1); | 384 size_t deviceRB = fDevice.rowBytes() - (width << 1); |
385 unsigned maskRB = mask.fRowBytes - width; | 385 unsigned maskRB = mask.fRowBytes - width; |
386 uint32_t expanded32 = fExpandedRaw16; | 386 uint32_t expanded32 = fExpandedRaw16; |
387 | 387 |
388 #ifdef SK_USE_NEON | 388 #ifdef SK_USE_NEON |
389 #define UNROLL 8 | 389 #define UNROLL 8 |
390 do { | 390 do { |
391 int w = width; | 391 int w = width; |
392 if (w >= UNROLL) { | 392 if (w >= UNROLL) { |
393 uint32x4_t color; /* can use same one */ | 393 uint32x4_t color, dev_lo, dev_hi; |
394 uint32x4_t dev_lo, dev_hi; | 394 uint32x4_t wn1, wn2, tmp; |
395 uint32x4_t t1; | 395 uint32x4_t vmask_g16, vmask_ng16; |
396 uint32x4_t wn1, wn2; | 396 uint16x8_t valpha, vdev; |
397 uint16x4_t odev_lo, odev_hi; | 397 uint16x4_t odev_lo, odev_hi, valpha_lo, valpha_hi; |
398 uint16x4_t alpha_lo, alpha_hi; | |
399 uint16x8_t alpha_full; | |
400 | 398 |
| 399 // prepare constants |
| 400 vmask_g16 = vdupq_n_u32(SK_G16_MASK_IN_PLACE); |
| 401 vmask_ng16 = vdupq_n_u32(~SK_G16_MASK_IN_PLACE); |
401 color = vdupq_n_u32(expanded32); | 402 color = vdupq_n_u32(expanded32); |
402 | 403 |
403 do { | 404 do { |
404 /* alpha is 8x8, widen and split to get pair of 16x4's */ | 405 // alpha is 8x8, widen and split to get a pair of 16x4 |
405 alpha_full = vmovl_u8(vld1_u8(alpha)); | 406 valpha = vaddw_u8(vdupq_n_u16(1), vld1_u8(alpha)); |
406 alpha_full = vaddq_u16(alpha_full, vshrq_n_u16(alpha_full,7)); | 407 valpha = vshrq_n_u16(valpha, 3); |
407 alpha_full = vshrq_n_u16(alpha_full, 3); | 408 valpha_lo = vget_low_u16(valpha); |
408 alpha_lo = vget_low_u16(alpha_full); | 409 valpha_hi = vget_high_u16(valpha); |
409 alpha_hi = vget_high_u16(alpha_full); | |
410 | 410 |
411 dev_lo = vmovl_u16(vld1_u16(device)); | 411 // load pixels |
412 dev_hi = vmovl_u16(vld1_u16(device+4)); | 412 vdev = vld1q_u16(device); |
| 413 dev_lo = vmovl_u16(vget_low_u16(vdev)); |
| 414 dev_hi = vmovl_u16(vget_high_u16(vdev)); |
413 | 415 |
414 /* unpack in 32 bits */ | 416 // unpack them in 32 bits |
415 dev_lo = vorrq_u32( | 417 dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16,
16); |
416 vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), | 418 dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16,
16); |
417 vshlq_n_u32(vandq_u32(dev_lo, | |
418 vdupq_n_u32(0x000007E0)
), | |
419 16) | |
420 ); | |
421 dev_hi = vorrq_u32( | |
422 vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), | |
423 vshlq_n_u32(vandq_u32(dev_hi, | |
424 vdupq_n_u32(0x000007E0)
), | |
425 16) | |
426 ); | |
427 | 419 |
428 /* blend the two */ | 420 // blend with color |
429 t1 = vmulq_u32(vsubq_u32(color, dev_lo), vmovl_u16(alpha_lo)); | 421 tmp = (color - dev_lo) * vmovl_u16(valpha_lo); |
430 t1 = vshrq_n_u32(t1, 5); | 422 tmp = vshrq_n_u32(tmp, 5); |
431 dev_lo = vaddq_u32(dev_lo, t1); | 423 dev_lo += tmp; |
432 | 424 |
433 t1 = vmulq_u32(vsubq_u32(color, dev_hi), vmovl_u16(alpha_hi)); | 425 tmp = vmulq_u32(color - dev_hi, vmovl_u16(valpha_hi)); |
434 t1 = vshrq_n_u32(t1, 5); | 426 tmp = vshrq_n_u32(tmp, 5); |
435 dev_hi = vaddq_u32(dev_hi, t1); | 427 dev_hi += tmp; |
436 | 428 |
437 /* re-compact and store */ | 429 // re-compact |
438 wn1 = vandq_u32(dev_lo, vdupq_n_u32(0x0000F81F)), | 430 wn1 = dev_lo & vmask_ng16; |
439 wn2 = vshrq_n_u32(dev_lo, 16); | 431 wn2 = vshrq_n_u32(dev_lo, 16) & vmask_g16; |
440 wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); | 432 odev_lo = vmovn_u32(wn1 | wn2); |
441 odev_lo = vmovn_u32(vorrq_u32(wn1, wn2)); | |
442 | 433 |
443 wn1 = vandq_u32(dev_hi, vdupq_n_u32(0x0000F81F)), | 434 wn1 = dev_hi & vmask_ng16; |
444 wn2 = vshrq_n_u32(dev_hi, 16); | 435 wn2 = vshrq_n_u32(dev_hi, 16) & vmask_g16; |
445 wn2 = vandq_u32(wn2, vdupq_n_u32(0x000007E0)); | 436 odev_hi = vmovn_u32(wn1 | wn2); |
446 odev_hi = vmovn_u32(vorrq_u32(wn1, wn2)); | |
447 | 437 |
448 vst1_u16(device, odev_lo); | 438 // store |
449 vst1_u16(device+4, odev_hi); | 439 vst1q_u16(device, vcombine_u16(odev_lo, odev_hi)); |
450 | 440 |
451 device += UNROLL; | 441 device += UNROLL; |
452 alpha += UNROLL; | 442 alpha += UNROLL; |
453 w -= UNROLL; | 443 w -= UNROLL; |
454 } while (w >= UNROLL); | 444 } while (w >= UNROLL); |
455 } | 445 } |
456 | 446 |
457 /* residuals (which is everything if we have no neon) */ | 447 // residuals |
458 while (w > 0) { | 448 while (w > 0) { |
459 *device = blend_compact(expanded32, SkExpand_rgb_16(*device), | 449 *device = blend_compact(expanded32, SkExpand_rgb_16(*device), |
460 SkAlpha255To256(*alpha++) >> 3); | 450 SkAlpha255To256(*alpha++) >> 3); |
461 device += 1; | 451 device += 1; |
462 --w; | 452 --w; |
463 } | 453 } |
464 device = (uint16_t*)((char*)device + deviceRB); | 454 device = (uint16_t*)((char*)device + deviceRB); |
465 alpha += maskRB; | 455 alpha += maskRB; |
466 } while (--height != 0); | 456 } while (--height != 0); |
467 #undef UNROLL | 457 #undef UNROLL |
(...skipping 588 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1056 SK_PLACEMENT_NEW_ARGS(blitter, SkRGB16_Opaque_Blitter, storage, | 1046 SK_PLACEMENT_NEW_ARGS(blitter, SkRGB16_Opaque_Blitter, storage, |
1057 storageSize, (device, paint)); | 1047 storageSize, (device, paint)); |
1058 } else { | 1048 } else { |
1059 SK_PLACEMENT_NEW_ARGS(blitter, SkRGB16_Blitter, storage, | 1049 SK_PLACEMENT_NEW_ARGS(blitter, SkRGB16_Blitter, storage, |
1060 storageSize, (device, paint)); | 1050 storageSize, (device, paint)); |
1061 } | 1051 } |
1062 } | 1052 } |
1063 | 1053 |
1064 return blitter; | 1054 return blitter; |
1065 } | 1055 } |
OLD | NEW |