OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 | 8 |
9 #include "SkBlitRow_opts_SSE2.h" | 9 #include "SkBlitRow_opts_SSE2.h" |
10 #include "SkBitmapProcState_opts_SSE2.h" | 10 #include "SkBitmapProcState_opts_SSE2.h" |
(...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
537 #endif | 537 #endif |
538 | 538 |
539 #if SK_B16x5_B32x5_SHIFT == 0 | 539 #if SK_B16x5_B32x5_SHIFT == 0 |
540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) | 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) |
541 #elif SK_B16x5_B32x5_SHIFT > 0 | 541 #elif SK_B16x5_B32x5_SHIFT > 0 |
542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32
x5_SHIFT)) | 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32
x5_SHIFT)) |
543 #else | 543 #else |
544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B3
2x5_SHIFT)) | 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B3
2x5_SHIFT)) |
545 #endif | 545 #endif |
546 | 546 |
547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, | 547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, |
548 __m128i &mask, __m128i &scale) { | 548 __m128i &mask, __m128i &srcA) { |
| 549 // In the following comments, the components of src, dst and mask are |
| 550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
| 551 // by an R, G, B, or A suffix. Components of one of the four pixels that |
| 552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
| 553 // example is the blue channel of the second destination pixel. Memory |
| 554 // layout is shown for an ARGB byte order in a color value. |
| 555 |
| 556 // src and srcA store 8-bit values interleaved with zeros. |
| 557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, |
| 559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) |
| 560 // mask stores 16-bit values (compressed three channels) interleaved with ze
ros. |
| 561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. |
| 562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 564 |
549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
| 566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
551 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 568 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
552 | 569 |
| 570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
554 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 572 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
555 | 573 |
| 574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
557 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 576 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
558 | 577 |
559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
| 579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
| 580 // 8-bit position |
| 581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
| 582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
560 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 583 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
561 | 584 |
562 // Interleave R,G,B into the lower byte of word. | 585 // Interleave R,G,B into the lower byte of word. |
| 586 // i.e. split the sixteen 8-bit values from mask into two sets of eight |
| 587 // 16-bit values, padded by zero. |
563 __m128i maskLo, maskHi; | 588 __m128i maskLo, maskHi; |
| 589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
| 591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
566 | 593 |
567 // Upscale to 0..32 | 594 // Upscale from 0..31 to 0..32 |
| 595 // (allows replacing the division by 32 with a right shift further down) |
| 596 // Right-shift each component by 4 and add the result back to that component, |
| 597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
570 | 600 |
571 maskLo = _mm_mullo_epi16(maskLo, scale); | 601 // Multiply each component of maskLo and maskHi by srcA |
572 maskHi = _mm_mullo_epi16(maskHi, scale); | 602 maskLo = _mm_mullo_epi16(maskLo, srcA); |
573 | 603 maskHi = _mm_mullo_epi16(maskHi, srcA); |
| 604 |
| 605 // Right shift mask components by 8 (divide by 256) |
574 maskLo = _mm_srli_epi16(maskLo, 8); | 606 maskLo = _mm_srli_epi16(maskLo, 8); |
575 maskHi = _mm_srli_epi16(maskHi, 8); | 607 maskHi = _mm_srli_epi16(maskHi, 8); |
576 | 608 |
577 // Interleave R,G,B into the lower byte of the word. | 609 // Interleave R,G,B into the lower byte of the word |
| 610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
| 612 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
580 | 614 |
581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 615 // mask = (src - dst) * mask |
582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
583 | 617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
| 618 |
| 619 // mask = (src - dst) * mask >> 5 |
584 maskLo = _mm_srai_epi16(maskLo, 5); | 620 maskLo = _mm_srai_epi16(maskLo, 5); |
585 maskHi = _mm_srai_epi16(maskHi, 5); | 621 maskHi = _mm_srai_epi16(maskHi, 5); |
586 | 622 |
587 // Add two pixels into result. | 623 // Add two pixels into result. |
| 624 // result = dst + ((src - dst) * mask >> 5) |
588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
590 | 627 |
591 // Pack into 4 32bit dst pixels | 628 // Pack into 4 32bit dst pixels. |
| 629 // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
| 630 // Merge into one SSE register with sixteen 8-bit values (four pixels), |
| 631 // clamping to 255 if necessary. |
592 return _mm_packus_epi16(resultLo, resultHi); | 632 return _mm_packus_epi16(resultLo, resultHi); |
593 } | 633 } |
594 | 634 |
595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, | 635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, |
596 __m128i &mask) { | 636 __m128i &mask) { |
| 637 // In the following comments, the components of src, dst and mask are |
| 638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
| 639 // by an R, G, B, or A suffix. Components of one of the four pixels that |
| 640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
| 641 // example is the blue channel of the second destination pixel. Memory |
| 642 // layout is shown for an ARGB byte order in a color value. |
| 643 |
| 644 // src stores 8-bit values interleaved with zeros. |
| 645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 646 // mask stores 16-bit values (shown as high and low bytes) interleaved with |
| 647 // zeros |
| 648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 650 |
597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
| 652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
599 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 654 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
600 | 655 |
| 656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
602 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 658 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
603 | 659 |
| 660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
605 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 662 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
606 | 663 |
607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
| 665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
| 666 // 8-bit position |
| 667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
| 668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
608 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 669 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
609 | 670 |
610 // Interleave R,G,B into the lower byte of word. | 671 // Interleave R,G,B into the lower byte of word. |
| 672 // i.e. split the sixteen 8-bit values from mask into two sets of eight |
| 673 // 16-bit values, padded by zero. |
611 __m128i maskLo, maskHi; | 674 __m128i maskLo, maskHi; |
| 675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
| 677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
614 | 679 |
615 // Upscale to 0..32 | 680 // Upscale from 0..31 to 0..32 |
| 681 // (allows replacing the division by 32 with a right shift further down) |
| 682 // Right-shift each component by 4 and add the result back to that component, |
| 683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
618 | 686 |
619 // Interleave R,G,B into the lower byte of the word. | 687 // Interleave R,G,B into the lower byte of the word |
| 688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
| 690 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
622 | 692 |
623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 693 // mask = (src - dst) * mask |
624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
625 | 695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
| 696 |
| 697 // mask = (src - dst) * mask >> 5 |
626 maskLo = _mm_srai_epi16(maskLo, 5); | 698 maskLo = _mm_srai_epi16(maskLo, 5); |
627 maskHi = _mm_srai_epi16(maskHi, 5); | 699 maskHi = _mm_srai_epi16(maskHi, 5); |
628 | 700 |
629 // Add two pixels into result. | 701 // Add two pixels into result. |
| 702 // result = dst + ((src - dst) * mask >> 5) |
630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
632 | 705 |
633 // Pack into 4 32bit dst pixels and force opaque. | 706 // Pack into 4 32bit dst pixels and force opaque. |
| 707 // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
| 708 // Merge into one SSE register with sixteen 8-bit values (four pixels), |
| 709 // clamping to 255 if necessary. Set alpha components to 0xFF. |
634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), | 710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); | 711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
636 } | 712 } |
637 | 713 |
638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], | 714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], |
639 SkColor color, int width, SkPMColor) { | 715 SkColor src, int width, SkPMColor) { |
640 if (width <= 0) { | 716 if (width <= 0) { |
641 return; | 717 return; |
642 } | 718 } |
643 | 719 |
644 int srcA = SkColorGetA(color); | 720 int srcA = SkColorGetA(src); |
645 int srcR = SkColorGetR(color); | 721 int srcR = SkColorGetR(src); |
646 int srcG = SkColorGetG(color); | 722 int srcG = SkColorGetG(src); |
647 int srcB = SkColorGetB(color); | 723 int srcB = SkColorGetB(src); |
648 | 724 |
649 srcA = SkAlpha255To256(srcA); | 725 srcA = SkAlpha255To256(srcA); |
650 | 726 |
651 if (width >= 4) { | 727 if (width >= 4) { |
652 SkASSERT(((size_t)dst & 0x03) == 0); | 728 SkASSERT(((size_t)dst & 0x03) == 0); |
653 while (((size_t)dst & 0x0F) != 0) { | 729 while (((size_t)dst & 0x0F) != 0) { |
654 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
655 src++; | 731 mask++; |
656 dst++; | 732 dst++; |
657 width--; | 733 width--; |
658 } | 734 } |
659 | 735 |
660 __m128i *d = reinterpret_cast<__m128i*>(dst); | 736 __m128i *d = reinterpret_cast<__m128i*>(dst); |
661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 737 // Set alpha to 0xFF and replicate source four times in SSE register. |
662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
663 __m128i scale = _mm_set1_epi16(srcA); | 739 // Interleave with zeros to get two sets of four 16-bit values. |
| 740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
| 741 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 742 // Set srcA_sse to contain eight copies of srcA, padded with zero. |
| 743 __m128i srcA_sse = _mm_set1_epi16(srcA); |
664 while (width >= 4) { | 744 while (width >= 4) { |
665 __m128i dst_pixel = _mm_load_si128(d); | 745 // Load four destination pixels into dst_sse. |
666 __m128i mask_pixel = _mm_loadl_epi64( | 746 __m128i dst_sse = _mm_load_si128(d); |
667 reinterpret_cast<const __m128i*>(src)); | 747 // Load four 16-bit masks into lower half of mask_sse. |
668 | 748 __m128i mask_sse = _mm_loadl_epi64( |
669 // Check whether mask_pixels are equal to 0 and get the highest bit | 749 reinterpret_cast<const __m128i*>(mask)); |
670 // of each byte of result, if mask pixes are all zero, we will get | 750 |
| 751 // Check whether masks are equal to 0 and get the highest bit |
| 752 // of each byte of result, if masks are all zero, we will get |
671 // pack_cmp to 0xFFFF | 753 // pack_cmp to 0xFFFF |
672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
673 _mm_setzero_si128())); | 755 _mm_setzero_si128())); |
674 | 756 |
675 // if mask pixels are not all zero, we will blend the dst pixels | 757 // if mask pixels are not all zero, we will blend the dst pixels |
676 if (pack_cmp != 0xFFFF) { | 758 if (pack_cmp != 0xFFFF) { |
677 // Unpack 4 16bit mask pixels to | 759 // Unpack 4 16bit mask pixels to |
678 // (p0, 0, p1, 0, p2, 0, p3, 0) | 760 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
679 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 761 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
680 _mm_setzero_si128()); | 762 mask_sse = _mm_unpacklo_epi16(mask_sse, |
| 763 _mm_setzero_si128()); |
681 | 764 |
682 // Process 4 32bit dst pixels | 765 // Process 4 32bit dst pixels |
683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, | 766 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, |
684 mask_pixel, scale); | 767 mask_sse, srcA_sse); |
685 _mm_store_si128(d, result); | 768 _mm_store_si128(d, result); |
686 } | 769 } |
687 | 770 |
688 d++; | 771 d++; |
689 src += 4; | 772 mask += 4; |
690 width -= 4; | 773 width -= 4; |
691 } | 774 } |
692 | 775 |
693 dst = reinterpret_cast<SkPMColor*>(d); | 776 dst = reinterpret_cast<SkPMColor*>(d); |
694 } | 777 } |
695 | 778 |
696 while (width > 0) { | 779 while (width > 0) { |
697 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 780 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
698 src++; | 781 mask++; |
699 dst++; | 782 dst++; |
700 width--; | 783 width--; |
701 } | 784 } |
702 } | 785 } |
703 | 786 |
704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], | 787 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], |
705 SkColor color, int width, SkPMColor opaqueDst) { | 788 SkColor src, int width, SkPMColor opaqueDst) { |
706 if (width <= 0) { | 789 if (width <= 0) { |
707 return; | 790 return; |
708 } | 791 } |
709 | 792 |
710 int srcR = SkColorGetR(color); | 793 int srcR = SkColorGetR(src); |
711 int srcG = SkColorGetG(color); | 794 int srcG = SkColorGetG(src); |
712 int srcB = SkColorGetB(color); | 795 int srcB = SkColorGetB(src); |
713 | 796 |
714 if (width >= 4) { | 797 if (width >= 4) { |
715 SkASSERT(((size_t)dst & 0x03) == 0); | 798 SkASSERT(((size_t)dst & 0x03) == 0); |
716 while (((size_t)dst & 0x0F) != 0) { | 799 while (((size_t)dst & 0x0F) != 0) { |
717 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 800 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
718 src++; | 801 mask++; |
719 dst++; | 802 dst++; |
720 width--; | 803 width--; |
721 } | 804 } |
722 | 805 |
723 __m128i *d = reinterpret_cast<__m128i*>(dst); | 806 __m128i *d = reinterpret_cast<__m128i*>(dst); |
724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 807 // Set alpha to 0xFF and replicate source four times in SSE register. |
725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 808 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
| 809 // Interleave with zeros to get two sets of four 16-bit values. |
| 810 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 811 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
726 while (width >= 4) { | 812 while (width >= 4) { |
727 __m128i dst_pixel = _mm_load_si128(d); | 813 // Load four destination pixels into dst_sse. |
728 __m128i mask_pixel = _mm_loadl_epi64( | 814 __m128i dst_sse = _mm_load_si128(d); |
729 reinterpret_cast<const __m128i*>(src)); | 815 // Load four 16-bit masks into lower half of mask_sse. |
| 816 __m128i mask_sse = _mm_loadl_epi64( |
| 817 reinterpret_cast<const __m128i*>(mask)); |
730 | 818 |
731 // Check whether mask_pixels are equal to 0 and get the highest bit | 819 // Check whether masks are equal to 0 and get the highest bit |
732 // of each byte of result, if mask pixes are all zero, we will get | 820 // of each byte of result, if masks are all zero, we will get |
733 // pack_cmp to 0xFFFF | 821 // pack_cmp to 0xFFFF |
734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 822 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
735 _mm_setzero_si128())); | 823 _mm_setzero_si128())); |
736 | 824 |
737 // if mask pixels are not all zero, we will blend the dst pixels | 825 // if mask pixels are not all zero, we will blend the dst pixels |
738 if (pack_cmp != 0xFFFF) { | 826 if (pack_cmp != 0xFFFF) { |
739 // Unpack 4 16bit mask pixels to | 827 // Unpack 4 16bit mask pixels to |
740 // (p0, 0, p1, 0, p2, 0, p3, 0) | 828 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
741 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 829 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
742 _mm_setzero_si128()); | 830 mask_sse = _mm_unpacklo_epi16(mask_sse, |
| 831 _mm_setzero_si128()); |
743 | 832 |
744 // Process 4 32bit dst pixels | 833 // Process 4 32bit dst pixels |
745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, | 834 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, |
746 mask_pixel); | 835 mask_sse); |
747 _mm_store_si128(d, result); | 836 _mm_store_si128(d, result); |
748 } | 837 } |
749 | 838 |
750 d++; | 839 d++; |
751 src += 4; | 840 mask += 4; |
752 width -= 4; | 841 width -= 4; |
753 } | 842 } |
754 | 843 |
755 dst = reinterpret_cast<SkPMColor*>(d); | 844 dst = reinterpret_cast<SkPMColor*>(d); |
756 } | 845 } |
757 | 846 |
758 while (width > 0) { | 847 while (width > 0) { |
759 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
760 src++; | 849 mask++; |
761 dst++; | 850 dst++; |
762 width--; | 851 width--; |
763 } | 852 } |
764 } | 853 } |
OLD | NEW |