src/opts/SkBlitRow_opts_SSE2.cpp - Issue 17847010: Commented SSE blend functions and cleaned-up variable naming.

Unified Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 17847010: Commented SSE blend functions and cleaned-up variable naming. (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Fixed missing variable rename. Created 7 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlitRow_opts_SSE2.cpp

diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp

index 27ce1e5f62851bcd2f210f3f692ac74a070a77ba..f3d010e3bc4502f1c54d4cb9b09488c6494a1a98 100644

--- a/src/opts/SkBlitRow_opts_SSE2.cpp

+++ b/src/opts/SkBlitRow_opts_SSE2.cpp

@@ -544,149 +544,232 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,

#define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))

#endif

-static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,

- __m128i &mask, __m128i &scale) {

+static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

+ __m128i &mask, __m128i &srcA) {

+ // In the following comments, the components of src, dst and mask are

+ // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

+ // by an R, G, B, or A suffix. Components of one of the four pixels that

+ // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

+ // example, is the blue channel of the second destination pixel. Memory

+ // layout is shown for an ARGB byte order in a color value.

+ // src and srcA store 8-bit values interleaved with zeros.

+ // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

+ // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

+ // srcA, 0, srcA, 0, srcA, 0, srcA, 0)

+ // mask stores 16-bit values (compressed three channels) interleaved with zeros.

+ // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

+ // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

+ // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

+ // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

+ // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

+ // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

+ // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

+ // 8-bit position

+ // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

+ // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

mask = _mm_or_si128(_mm_or_si128(r, g), b);

// Interleave R,G,B into the lower byte of word.

+ // i.e. split the sixteen 8-bit values from mask into two sets of eight

+ // 16-bit values, padded by zero.

__m128i maskLo, maskHi;

+ // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

+ // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

- // Upscale to 0..32

+ // Upscale from 0..31 to 0..32

+ // (allows division to be replaced by a right-shift further down)

+ // Right-shift each component by 4 and add the result back to that component,

+ // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

- maskLo = _mm_mullo_epi16(maskLo, scale);

- maskHi = _mm_mullo_epi16(maskHi, scale);

+ // Multiply each component of maskLo and maskHi by srcA

+ maskLo = _mm_mullo_epi16(maskLo, srcA);

+ maskHi = _mm_mullo_epi16(maskHi, srcA);

+ // Right shift mask components by 8 (divide by 256)

maskLo = _mm_srli_epi16(maskLo, 8);

maskHi = _mm_srli_epi16(maskHi, 8);

- // Interleave R,G,B into the lower byte of the word.

+ // Interleave R,G,B into the lower byte of the word

+ // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

+ // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

- maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));

- maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));

+ // mask = (src - dst) * mask

+ maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

+ maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

+ // mask = (src - dst) * mask >> 5

maskLo = _mm_srai_epi16(maskLo, 5);

maskHi = _mm_srai_epi16(maskHi, 5);

// Add two pixels into result.

+ // result = dst + ((src - dst) * mask >> 5)

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

- // Pack into 4 32bit dst pixels

+ // Pack into 4 32bit dst pixels.

+ // resultLo and resultHi contain eight 16-bit components (two pixels) each.

+ // Merge into one SSE register with sixteen 8-bit values (four pixels),

+ // clamping to 255 if necessary.

return _mm_packus_epi16(resultLo, resultHi);

}

-static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,

+static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

__m128i &mask) {