Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(13)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 17847010: Commented SSE blend functions and cleaned-up variable naming. (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Fixed missing variable rename. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « include/core/SkColorPriv.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 8
9 #include "SkBlitRow_opts_SSE2.h" 9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h" 10 #include "SkBitmapProcState_opts_SSE2.h"
(...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after
537 #endif 537 #endif
538 538
539 #if SK_B16x5_B32x5_SHIFT == 0 539 #if SK_B16x5_B32x5_SHIFT == 0
540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
541 #elif SK_B16x5_B32x5_SHIFT > 0 541 #elif SK_B16x5_B32x5_SHIFT > 0
542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
543 #else 543 #else
544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
545 #endif 545 #endif
546 546
547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, 547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
548 __m128i &mask, __m128i &scale) { 548 __m128i &mask, __m128i &srcA) {
549 // In the following comments, the components of src, dst and mask are
550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
551 // by an R, G, B, or A suffix. Components of one of the four pixels that
552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
553 // example is the blue channel of the second destination pixel. Memory
554 // layout is shown for an ARGB byte order in a color value.
555
556 // src and srcA store 8-bit values interleaved with zeros.
557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
560 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
564
549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
551 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 568 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
552 569
570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
554 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 572 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
555 573
574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
557 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 576 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
558 577
559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
580 // 8-bit position
581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
560 mask = _mm_or_si128(_mm_or_si128(r, g), b); 583 mask = _mm_or_si128(_mm_or_si128(r, g), b);
561 584
562 // Interleave R,G,B into the lower byte of word. 585 // Interleave R,G,B into the lower byte of word.
586 // i.e. split the sixteen 8-bit values from mask into two sets of eight
587 // 16-bit values, padded by zero.
563 __m128i maskLo, maskHi; 588 __m128i maskLo, maskHi;
589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
566 593
567 // Upscale to 0..32 594 // Upscale from 0..31 to 0..32
595 // (allows replacing division by a right-shift further down)
596 // Right-shift each component by 4 and add the result back to that component,
597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
570 600
571 maskLo = _mm_mullo_epi16(maskLo, scale); 601 // Multiply each component of maskLo and maskHi by srcA
572 maskHi = _mm_mullo_epi16(maskHi, scale); 602 maskLo = _mm_mullo_epi16(maskLo, srcA);
573 603 maskHi = _mm_mullo_epi16(maskHi, srcA);
604
605 // Right-shift mask components by 8 (divide by 256)
574 maskLo = _mm_srli_epi16(maskLo, 8); 606 maskLo = _mm_srli_epi16(maskLo, 8);
575 maskHi = _mm_srli_epi16(maskHi, 8); 607 maskHi = _mm_srli_epi16(maskHi, 8);
576 608
577 // Interleave R,G,B into the lower byte of the word. 609 // Interleave R,G,B into the lower byte of the word
610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
612 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
580 614
581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); 615 // mask = (src - dst) * mask
582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); 616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
583 617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
618
619 // mask = (src - dst) * mask >> 5
584 maskLo = _mm_srai_epi16(maskLo, 5); 620 maskLo = _mm_srai_epi16(maskLo, 5);
585 maskHi = _mm_srai_epi16(maskHi, 5); 621 maskHi = _mm_srai_epi16(maskHi, 5);
586 622
587 // Add two pixels into result. 623 // Add two pixels into result.
624 // result = dst + ((src - dst) * mask >> 5)
588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
590 627
591 // Pack into 4 32bit dst pixels 628 // Pack into 4 32bit dst pixels.
629 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
630 // Merge into one SSE register with sixteen 8-bit values (four pixels),
631 // clamping to 255 if necessary.
592 return _mm_packus_epi16(resultLo, resultHi); 632 return _mm_packus_epi16(resultLo, resultHi);
593 } 633 }
594 634
595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, 635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
596 __m128i &mask) { 636 __m128i &mask) {
637 // In the following comments, the components of src, dst and mask are
638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
639 // by an R, G, B, or A suffix. Components of one of the four pixels that
640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
641 // example is the blue channel of the second destination pixel. Memory
642 // layout is shown for an ARGB byte order in a color value.
643
644 // src stores 8-bit values interleaved with zeros.
645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
646 // mask stores 16-bit values (shown as high and low bytes) interleaved with
647 // zeros
648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
650
597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
599 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 654 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
600 655
656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
602 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 658 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
603 659
660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
605 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 662 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
606 663
607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
666 // 8-bit position
667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
608 mask = _mm_or_si128(_mm_or_si128(r, g), b); 669 mask = _mm_or_si128(_mm_or_si128(r, g), b);
609 670
610 // Interleave R,G,B into the lower byte of word. 671 // Interleave R,G,B into the lower byte of word.
672 // i.e. split the sixteen 8-bit values from mask into two sets of eight
673 // 16-bit values, padded by zero.
611 __m128i maskLo, maskHi; 674 __m128i maskLo, maskHi;
675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
614 679
615 // Upscale to 0..32 680 // Upscale from 0..31 to 0..32
681 // (allows replacing division by a right-shift further down)
682 // Right-shift each component by 4 and add the result back to that component,
683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
618 686
619 // Interleave R,G,B into the lower byte of the word. 687 // Interleave R,G,B into the lower byte of the word
688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
690 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
622 692
623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); 693 // mask = (src - dst) * mask
624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); 694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
625 695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
696
697 // mask = (src - dst) * mask >> 5
626 maskLo = _mm_srai_epi16(maskLo, 5); 698 maskLo = _mm_srai_epi16(maskLo, 5);
627 maskHi = _mm_srai_epi16(maskHi, 5); 699 maskHi = _mm_srai_epi16(maskHi, 5);
628 700
629 // Add two pixels into result. 701 // Add two pixels into result.
702 // result = dst + ((src - dst) * mask >> 5)
630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
632 705
633 // Pack into 4 32bit dst pixels and force opaque. 706 // Pack into 4 32bit dst pixels and force opaque.
707 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
708 // Merge into one SSE register with sixteen 8-bit values (four pixels),
709 // clamping to 255 if necessary. Set alpha components to 0xFF.
634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
636 } 712 }
637 713
638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], 714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
639 SkColor color, int width, SkPMColor) { 715 SkColor src, int width, SkPMColor) {
640 if (width <= 0) { 716 if (width <= 0) {
641 return; 717 return;
642 } 718 }
643 719
644 int srcA = SkColorGetA(color); 720 int srcA = SkColorGetA(src);
645 int srcR = SkColorGetR(color); 721 int srcR = SkColorGetR(src);
646 int srcG = SkColorGetG(color); 722 int srcG = SkColorGetG(src);
647 int srcB = SkColorGetB(color); 723 int srcB = SkColorGetB(src);
648 724
649 srcA = SkAlpha255To256(srcA); 725 srcA = SkAlpha255To256(srcA);
650 726
651 if (width >= 4) { 727 if (width >= 4) {
652 SkASSERT(((size_t)dst & 0x03) == 0); 728 SkASSERT(((size_t)dst & 0x03) == 0);
653 while (((size_t)dst & 0x0F) != 0) { 729 while (((size_t)dst & 0x0F) != 0) {
654 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); 730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
655 src++; 731 mask++;
656 dst++; 732 dst++;
657 width--; 733 width--;
658 } 734 }
659 735
660 __m128i *d = reinterpret_cast<__m128i*>(dst); 736 __m128i *d = reinterpret_cast<__m128i*>(dst);
661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 737 // Set alpha to 0xFF and replicate source four times in SSE register.
662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); 738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
663 __m128i scale = _mm_set1_epi16(srcA); 739 // Interleave with zeros to get two sets of four 16-bit values.
740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
741 // Set srcA_sse to contain eight copies of srcA, padded with zero.
742 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
743 __m128i srcA_sse = _mm_set1_epi16(srcA);
664 while (width >= 4) { 744 while (width >= 4) {
665 __m128i dst_pixel = _mm_load_si128(d); 745 // Load four destination pixels into dst_sse.
666 __m128i mask_pixel = _mm_loadl_epi64( 746 __m128i dst_sse = _mm_load_si128(d);
667 reinterpret_cast<const __m128i*>(src)); 747 // Load four 16-bit masks into lower half of mask_sse.
668 748 __m128i mask_sse = _mm_loadl_epi64(
669 // Check whether mask_pixels are equal to 0 and get the highest bit 749 reinterpret_cast<const __m128i*>(mask));
670 // of each byte of result, if mask pixes are all zero, we will get 750
751 // Check whether masks are equal to 0 and get the highest bit
752 // of each byte of result, if masks are all zero, we will get
671 // pack_cmp to 0xFFFF 753 // pack_cmp to 0xFFFF
672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, 754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
673 _mm_setzero_si128())); 755 _mm_setzero_si128()));
674 756
675 // if mask pixels are not all zero, we will blend the dst pixels 757 // if mask pixels are not all zero, we will blend the dst pixels
676 if (pack_cmp != 0xFFFF) { 758 if (pack_cmp != 0xFFFF) {
677 // Unpack 4 16bit mask pixels to 759 // Unpack 4 16bit mask pixels to
678 // (p0, 0, p1, 0, p2, 0, p3, 0) 760 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
679 mask_pixel = _mm_unpacklo_epi16(mask_pixel, 761 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
680 _mm_setzero_si128()); 762 mask_sse = _mm_unpacklo_epi16(mask_sse,
763 _mm_setzero_si128());
681 764
682 // Process 4 32bit dst pixels 765 // Process 4 32bit dst pixels
683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, 766 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
684 mask_pixel, scale); 767 mask_sse, srcA_sse);
685 _mm_store_si128(d, result); 768 _mm_store_si128(d, result);
686 } 769 }
687 770
688 d++; 771 d++;
689 src += 4; 772 mask += 4;
690 width -= 4; 773 width -= 4;
691 } 774 }
692 775
693 dst = reinterpret_cast<SkPMColor*>(d); 776 dst = reinterpret_cast<SkPMColor*>(d);
694 } 777 }
695 778
696 while (width > 0) { 779 while (width > 0) {
697 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); 780 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
698 src++; 781 mask++;
699 dst++; 782 dst++;
700 width--; 783 width--;
701 } 784 }
702 } 785 }
703 786
704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], 787 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
705 SkColor color, int width, SkPMColor opaqueDst) { 788 SkColor src, int width, SkPMColor opaqueDst) {
706 if (width <= 0) { 789 if (width <= 0) {
707 return; 790 return;
708 } 791 }
709 792
710 int srcR = SkColorGetR(color); 793 int srcR = SkColorGetR(src);
711 int srcG = SkColorGetG(color); 794 int srcG = SkColorGetG(src);
712 int srcB = SkColorGetB(color); 795 int srcB = SkColorGetB(src);
713 796
714 if (width >= 4) { 797 if (width >= 4) {
715 SkASSERT(((size_t)dst & 0x03) == 0); 798 SkASSERT(((size_t)dst & 0x03) == 0);
716 while (((size_t)dst & 0x0F) != 0) { 799 while (((size_t)dst & 0x0F) != 0) {
717 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); 800 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
718 src++; 801 mask++;
719 dst++; 802 dst++;
720 width--; 803 width--;
721 } 804 }
722 805
723 __m128i *d = reinterpret_cast<__m128i*>(dst); 806 __m128i *d = reinterpret_cast<__m128i*>(dst);
724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 807 // Set alpha to 0xFF and replicate source four times in SSE register.
725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); 808 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
809 // Interleave with zeros to get two sets of four 16-bit values.
810 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
811 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
726 while (width >= 4) { 812 while (width >= 4) {
727 __m128i dst_pixel = _mm_load_si128(d); 813 // Load four destination pixels into dst_sse.
728 __m128i mask_pixel = _mm_loadl_epi64( 814 __m128i dst_sse = _mm_load_si128(d);
729 reinterpret_cast<const __m128i*>(src)); 815 // Load four 16-bit masks into lower half of mask_sse.
816 __m128i mask_sse = _mm_loadl_epi64(
817 reinterpret_cast<const __m128i*>(mask));
730 818
731 // Check whether mask_pixels are equal to 0 and get the highest bit 819 // Check whether masks are equal to 0 and get the highest bit
732 // of each byte of result, if mask pixes are all zero, we will get 820 // of each byte of result, if masks are all zero, we will get
733 // pack_cmp to 0xFFFF 821 // pack_cmp to 0xFFFF
734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, 822 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
735 _mm_setzero_si128())); 823 _mm_setzero_si128()));
736 824
737 // if mask pixels are not all zero, we will blend the dst pixels 825 // if mask pixels are not all zero, we will blend the dst pixels
738 if (pack_cmp != 0xFFFF) { 826 if (pack_cmp != 0xFFFF) {
739 // Unpack 4 16bit mask pixels to 827 // Unpack 4 16bit mask pixels to
740 // (p0, 0, p1, 0, p2, 0, p3, 0) 828 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
741 mask_pixel = _mm_unpacklo_epi16(mask_pixel, 829 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
742 _mm_setzero_si128()); 830 mask_sse = _mm_unpacklo_epi16(mask_sse,
831 _mm_setzero_si128());
743 832
744 // Process 4 32bit dst pixels 833 // Process 4 32bit dst pixels
745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, 834 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
746 mask_pixel); 835 mask_sse);
747 _mm_store_si128(d, result); 836 _mm_store_si128(d, result);
748 } 837 }
749 838
750 d++; 839 d++;
751 src += 4; 840 mask += 4;
752 width -= 4; 841 width -= 4;
753 } 842 }
754 843
755 dst = reinterpret_cast<SkPMColor*>(d); 844 dst = reinterpret_cast<SkPMColor*>(d);
756 } 845 }
757 846
758 while (width > 0) { 847 while (width > 0) {
759 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); 848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
760 src++; 849 mask++;
761 dst++; 850 dst++;
762 width--; 851 width--;
763 } 852 }
764 } 853 }
OLDNEW
« no previous file with comments | « include/core/SkColorPriv.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698