| Index: src/effects/SkBlurMask.cpp
|
| ===================================================================
|
| --- src/effects/SkBlurMask.cpp (revision 9493)
|
| +++ src/effects/SkBlurMask.cpp (working copy)
|
| @@ -412,482 +412,6 @@
|
| }
|
| }
|
|
|
| -// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
|
| -// break-even on Mac, and a ~15% slowdown on Linux.
|
| -// Reading a word at a time when building the sum buffer seems to give
|
| -// us no appreciable speedup on Windows or Mac, and a 2% slowdown on Linux.
|
| -#if defined(SK_BUILD_FOR_WIN32)
|
| -#define UNROLL_KERNEL_LOOP 1
|
| -#endif
|
| -
|
| -/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
|
| - src values at their position, plus all values above and to the left.
|
| - When we sample into this buffer, we need an initial row and column of 0s,
|
| - so we have an index correspondence as follows:
|
| -
|
| - src[i, j] == sum[i+1, j+1]
|
| - sum[0, j] == sum[i, 0] == 0
|
| -
|
| - We assume that the sum buffer's stride == its width
|
| - */
|
| -static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
|
| - const uint8_t src[], int srcRB) {
|
| - int sumW = srcW + 1;
|
| -
|
| - SkASSERT(srcRB >= srcW);
|
| - // mod srcRB so we can apply it after each row
|
| - srcRB -= srcW;
|
| -
|
| - int x, y;
|
| -
|
| - // zero out the top row and column
|
| - memset(sum, 0, sumW * sizeof(sum[0]));
|
| - sum += sumW;
|
| -
|
| - // special case first row
|
| - uint32_t X = 0;
|
| - *sum++ = 0; // initialize the first column to 0
|
| - for (x = srcW - 1; x >= 0; --x) {
|
| - X = *src++ + X;
|
| - *sum++ = X;
|
| - }
|
| - src += srcRB;
|
| -
|
| - // now do the rest of the rows
|
| - for (y = srcH - 1; y > 0; --y) {
|
| - uint32_t L = 0;
|
| - uint32_t C = 0;
|
| - *sum++ = 0; // initialize the first column to 0
|
| -
|
| - for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
|
| - uint32_t T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - }
|
| -
|
| - for (; x >= 4; x-=4) {
|
| - uint32_t T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - }
|
| -
|
| - for (; x >= 0; --x) {
|
| - uint32_t T = sum[-sumW];
|
| - X = *src++ + L + T - C;
|
| - *sum++ = X;
|
| - L = X;
|
| - C = T;
|
| - }
|
| - src += srcRB;
|
| - }
|
| -}
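|
| For reference, the build_sum_buffer() deleted above computes a summed-area
| table: each entry holds the total of every source value above and to the
| left of it, which is what lets the kernels below sample any box in O(1).
| A minimal sketch of the same recurrence, without the alignment and
| unrolling tricks (hypothetical helper, not part of Skia; assumes <stdint.h>
| and <string.h>):
|
| static void build_sum_buffer_simple(uint32_t sum[], int srcW, int srcH,
|                                     const uint8_t src[], int srcRB) {
|     int sumW = srcW + 1;
|     memset(sum, 0, sumW * sizeof(sum[0]));    // zero the top border row
|     for (int y = 1; y <= srcH; ++y) {
|         sum[y * sumW] = 0;                     // zero the left border column
|         for (int x = 1; x <= srcW; ++x) {
|             // sum[y][x] = src + left + top - top-left (inclusion-exclusion)
|             sum[y * sumW + x] = src[(y - 1) * srcRB + (x - 1)]
|                               + sum[y * sumW + (x - 1)]
|                               + sum[(y - 1) * sumW + x]
|                               - sum[(y - 1) * sumW + (x - 1)];
|         }
|     }
| }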
|
| -
|
| -/**
|
| - * This is the path apply_kernel() takes when the kernel
|
| - * is wider than the source image.
|
| - */
|
| -static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
| - int sw, int sh) {
|
| - SkASSERT(2*rx > sw);
|
| -
|
| - uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));
|
| -
|
| - int sumStride = sw + 1;
|
| -
|
| - int dw = sw + 2*rx;
|
| - int dh = sh + 2*ry;
|
| -
|
| - int prev_y = -2*ry;
|
| - int next_y = 1;
|
| -
|
| - for (int y = 0; y < dh; ++y) {
|
| - int py = SkClampPos(prev_y) * sumStride;
|
| - int ny = SkFastMin32(next_y, sh) * sumStride;
|
| -
|
| - int prev_x = -2*rx;
|
| - int next_x = 1;
|
| -
|
| - for (int x = 0; x < dw; ++x) {
|
| - int px = SkClampPos(prev_x);
|
| - int nx = SkFastMin32(next_x, sw);
|
| -
|
| - // TODO: should we be adding 1/2 (1 << 23) to round to the
|
| - // nearest integer here?
|
| - uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - prev_y += 1;
|
| - next_y += 1;
|
| - }
|
| -}
|
| -/**
|
| - * sw and sh are the width and height of the src. Since the sum buffer
|
| - * matches that, but has an extra row and col at the beginning (with zeros),
|
| - * we can just use sw and sh as our "max" values for pinning coordinates
|
| - * when sampling into sum[][]
|
| - *
|
| - * The inner loop is conceptually simple; we break it into several sections
|
| - * to improve performance. Here's the original version:
|
| - for (int x = 0; x < dw; ++x) {
|
| - int px = SkClampPos(prev_x);
|
| - int nx = SkFastMin32(next_x, sw);
|
| -
|
| - uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| - * The sections are:
|
| - * left-hand section, where prev_x is clamped to 0
|
| - * center section, where neither prev_x nor next_x is clamped
|
| - * right-hand section, where next_x is clamped to sw
|
| - * On some operating systems, the center section is unrolled for additional
|
| - * speedup.
|
| -*/
|
| -static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
| - int sw, int sh) {
|
| - if (2*rx > sw) {
|
| - kernel_clamped(dst, rx, ry, sum, sw, sh);
|
| - return;
|
| - }
|
| -
|
| - uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));
|
| -
|
| - int sumStride = sw + 1;
|
| -
|
| - int dw = sw + 2*rx;
|
| - int dh = sh + 2*ry;
|
| -
|
| - int prev_y = -2*ry;
|
| - int next_y = 1;
|
| -
|
| - SkASSERT(2*rx <= dw - 2*rx);
|
| -
|
| - for (int y = 0; y < dh; ++y) {
|
| - int py = SkClampPos(prev_y) * sumStride;
|
| - int ny = SkFastMin32(next_y, sh) * sumStride;
|
| -
|
| - int prev_x = -2*rx;
|
| - int next_x = 1;
|
| - int x = 0;
|
| -
|
| - for (; x < 2*rx; ++x) {
|
| - SkASSERT(prev_x <= 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - int px = 0;
|
| - int nx = next_x;
|
| -
|
| - uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - int i0 = prev_x + py;
|
| - int i1 = next_x + ny;
|
| - int i2 = next_x + py;
|
| - int i3 = prev_x + ny;
|
| -
|
| -#if UNROLL_KERNEL_LOOP
|
| - for (; x < dw - 2*rx - 4; x += 4) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| - tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| - tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| - tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 4;
|
| - next_x += 4;
|
| - }
|
| -#endif
|
| -
|
| - for (; x < dw - 2*rx; ++x) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - for (; x < dw; ++x) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x > sw);
|
| -
|
| - int px = prev_x;
|
| - int nx = sw;
|
| -
|
| - uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
| - *dst++ = SkToU8(tmp * scale >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - prev_y += 1;
|
| - next_y += 1;
|
| - }
|
| -}
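|
| The four sum[] reads in apply_kernel() are the summed-area identity: the
| total of any axis-aligned box falls out of its four corner samples, and the
| divide by the kernel area is folded into an 8.24 fixed-point reciprocal so
| each pixel costs one multiply and one shift. A sketch of one output pixel
| under the sum-buffer layout described above (hypothetical helper; the
| removed code pre-multiplies py/ny by the stride and hoists scale out of
| the loops):
|
| static uint8_t box_average(const uint32_t sum[], int sumStride,
|                            int px, int py, int nx, int ny, int rx, int ry) {
|     // box total over corners (px, py)..(nx, ny) via inclusion-exclusion
|     uint32_t total = sum[py * sumStride + px] + sum[ny * sumStride + nx]
|                    - sum[py * sumStride + nx] - sum[ny * sumStride + px];
|     // divide by the (2*rx + 1) x (2*ry + 1) kernel area in 8.24 fixed point
|     uint32_t scale = (1 << 24) / ((2*rx + 1) * (2*ry + 1));
|     return (uint8_t)(total * scale >> 24);
| }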
|
| -
|
| -/**
|
| - * This is the path apply_kernel_interp() takes when the kernel
|
| - * is wider than the source image.
|
| - */
|
| -static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
|
| - const uint32_t sum[], int sw, int sh, U8CPU outerWeight) {
|
| - SkASSERT(2*rx > sw);
|
| -
|
| - int innerWeight = 255 - outerWeight;
|
| -
|
| - // round these guys up if they're bigger than 127
|
| - outerWeight += outerWeight >> 7;
|
| - innerWeight += innerWeight >> 7;
|
| -
|
| - uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1));
|
| - uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1));
|
| -
|
| - int sumStride = sw + 1;
|
| -
|
| - int dw = sw + 2*rx;
|
| - int dh = sh + 2*ry;
|
| -
|
| - int prev_y = -2*ry;
|
| - int next_y = 1;
|
| -
|
| - for (int y = 0; y < dh; ++y) {
|
| - int py = SkClampPos(prev_y) * sumStride;
|
| - int ny = SkFastMin32(next_y, sh) * sumStride;
|
| -
|
| - int ipy = SkClampPos(prev_y + 1) * sumStride;
|
| - int iny = SkClampMax(next_y - 1, sh) * sumStride;
|
| -
|
| - int prev_x = -2*rx;
|
| - int next_x = 1;
|
| -
|
| - for (int x = 0; x < dw; ++x) {
|
| - int px = SkClampPos(prev_x);
|
| - int nx = SkFastMin32(next_x, sw);
|
| -
|
| - int ipx = SkClampPos(prev_x + 1);
|
| - int inx = SkClampMax(next_x - 1, sw);
|
| -
|
| - uint32_t outerSum = sum[px+py] + sum[nx+ny]
|
| - - sum[nx+py] - sum[px+ny];
|
| - uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
|
| - - sum[inx+ipy] - sum[ipx+iny];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| - prev_y += 1;
|
| - next_y += 1;
|
| - }
|
| -}
|
| -
|
| -/**
|
| - * sw and sh are the width and height of the src. Since the sum buffer
|
| - * matches that, but has an extra row and col at the beginning (with zeros),
|
| - * we can just use sw and sh as our "max" values for pinning coordinates
|
| - * when sampling into sum[][]
|
| - *
|
| - * The inner loop is conceptually simple; we break it into several variants
|
| - * to improve performance. Here's the original version:
|
| - for (int x = 0; x < dw; ++x) {
|
| - int px = SkClampPos(prev_x);
|
| - int nx = SkFastMin32(next_x, sw);
|
| -
|
| - int ipx = SkClampPos(prev_x + 1);
|
| - int inx = SkClampMax(next_x - 1, sw);
|
| -
|
| - uint32_t outerSum = sum[px+py] + sum[nx+ny]
|
| - - sum[nx+py] - sum[px+ny];
|
| - uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
|
| - - sum[inx+ipy] - sum[ipx+iny];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| - * The sections are:
|
| - * left-hand section, where prev_x is clamped to 0
|
| - * center section, where neither prev_x nor next_x is clamped
|
| - * right-hand section, where next_x is clamped to sw
|
| - * On some operating systems, the center section is unrolled for additional
|
| - * speedup.
|
| -*/
|
| -static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
|
| - const uint32_t sum[], int sw, int sh, U8CPU outerWeight) {
|
| - SkASSERT(rx > 0 && ry > 0);
|
| - SkASSERT(outerWeight <= 255);
|
| -
|
| - if (2*rx > sw) {
|
| - kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outerWeight);
|
| - return;
|
| - }
|
| -
|
| - int innerWeight = 255 - outerWeight;
|
| -
|
| - // round these guys up if they're bigger than 127
|
| - outerWeight += outerWeight >> 7;
|
| - innerWeight += innerWeight >> 7;
|
| -
|
| - uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1));
|
| - uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1));
|
| -
|
| - int sumStride = sw + 1;
|
| -
|
| - int dw = sw + 2*rx;
|
| - int dh = sh + 2*ry;
|
| -
|
| - int prev_y = -2*ry;
|
| - int next_y = 1;
|
| -
|
| - SkASSERT(2*rx <= dw - 2*rx);
|
| -
|
| - for (int y = 0; y < dh; ++y) {
|
| - int py = SkClampPos(prev_y) * sumStride;
|
| - int ny = SkFastMin32(next_y, sh) * sumStride;
|
| -
|
| - int ipy = SkClampPos(prev_y + 1) * sumStride;
|
| - int iny = SkClampMax(next_y - 1, sh) * sumStride;
|
| -
|
| - int prev_x = -2*rx;
|
| - int next_x = 1;
|
| - int x = 0;
|
| -
|
| - for (; x < 2*rx; ++x) {
|
| - SkASSERT(prev_x < 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - int px = 0;
|
| - int nx = next_x;
|
| -
|
| - int ipx = 0;
|
| - int inx = next_x - 1;
|
| -
|
| - uint32_t outerSum = sum[px+py] + sum[nx+ny]
|
| - - sum[nx+py] - sum[px+ny];
|
| - uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
|
| - - sum[inx+ipy] - sum[ipx+iny];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - int i0 = prev_x + py;
|
| - int i1 = next_x + ny;
|
| - int i2 = next_x + py;
|
| - int i3 = prev_x + ny;
|
| - int i4 = prev_x + 1 + ipy;
|
| - int i5 = next_x - 1 + iny;
|
| - int i6 = next_x - 1 + ipy;
|
| - int i7 = prev_x + 1 + iny;
|
| -
|
| -#if UNROLL_KERNEL_LOOP
|
| - for (; x < dw - 2*rx - 4; x += 4) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| - outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| - outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| - outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 4;
|
| - next_x += 4;
|
| - }
|
| -#endif
|
| -
|
| - for (; x < dw - 2*rx; ++x) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x <= sw);
|
| -
|
| - uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
| - uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - for (; x < dw; ++x) {
|
| - SkASSERT(prev_x >= 0);
|
| - SkASSERT(next_x > sw);
|
| -
|
| - int px = prev_x;
|
| - int nx = sw;
|
| -
|
| - int ipx = prev_x + 1;
|
| - int inx = sw;
|
| -
|
| - uint32_t outerSum = sum[px+py] + sum[nx+ny]
|
| - - sum[nx+py] - sum[px+ny];
|
| - uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny]
|
| - - sum[inx+ipy] - sum[ipx+iny];
|
| - *dst++ = SkToU8((outerSum * outerScale
|
| - + innerSum * innerScale) >> 24);
|
| -
|
| - prev_x += 1;
|
| - next_x += 1;
|
| - }
|
| -
|
| - prev_y += 1;
|
| - next_y += 1;
|
| - }
|
| -}
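|
| The interp variants approximate fractional radii: each pixel is a weighted
| mix of the (2*rx + 1)-wide outer box and the (2*rx - 1)-wide inner box, with
| the 8-bit weights folded into the same fixed-point scales (8 bits of weight
| plus 16 bits from the shift, consumed by the final >> 24). Since the two
| weights sum to 255 and exactly one of them is >= 128, the ">> 7" bump makes
| the pair total 256, so no per-pixel divide is needed. A sketch of the blend,
| assuming the outer and inner box totals are already sampled (hypothetical
| helper):
|
| static uint8_t blend_box_sums(uint32_t outerSum, uint32_t innerSum,
|                               int rx, int ry, unsigned outerWeight) {
|     unsigned innerWeight = 255 - outerWeight;  // caller keeps weight <= 255
|     outerWeight += outerWeight >> 7;           // bump so the weights sum to 256
|     innerWeight += innerWeight >> 7;
|     uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1) * (2*ry + 1));
|     uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1) * (2*ry - 1));
|     return (uint8_t)((outerSum * outerScale + innerSum * innerScale) >> 24);
| }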
|
| -
|
| #include "SkColorPriv.h"
|
|
|
| static void merge_src_with_blur(uint8_t dst[], int dstRB,
|
| @@ -955,7 +479,7 @@
|
|
|
| bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
|
| SkScalar radius, Style style, Quality quality,
|
| - SkIPoint* margin, bool separable)
|
| + SkIPoint* margin)
|
| {
|
|
|
| if (src.fFormat != SkMask::kA8_Format) {
|
| @@ -1011,78 +535,40 @@
|
| SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dp);
|
|
|
| // build the blurry destination
|
| - if (separable) {
|
| - SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
|
| - uint8_t* tp = tmpBuffer.get();
|
| - int w = sw, h = sh;
|
| + SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
|
| + uint8_t* tp = tmpBuffer.get();
|
| + int w = sw, h = sh;
|
|
|
| - if (outerWeight == 255) {
|
| - int loRadius, hiRadius;
|
| - get_adjusted_radii(passRadius, &loRadius, &hiRadius);
|
| - if (kHigh_Quality == quality) {
|
| - // Do three X blurs, with a transpose on the final one.
|
| - w = boxBlur(sp, src.fRowBytes, tp, loRadius, hiRadius, w, h, false);
|
| - w = boxBlur(tp, w, dp, hiRadius, loRadius, w, h, false);
|
| - w = boxBlur(dp, w, tp, hiRadius, hiRadius, w, h, true);
|
| - // Do three Y blurs, with a transpose on the final one.
|
| - h = boxBlur(tp, h, dp, loRadius, hiRadius, h, w, false);
|
| - h = boxBlur(dp, h, tp, hiRadius, loRadius, h, w, false);
|
| - h = boxBlur(tp, h, dp, hiRadius, hiRadius, h, w, true);
|
| - } else {
|
| - w = boxBlur(sp, src.fRowBytes, tp, rx, rx, w, h, true);
|
| - h = boxBlur(tp, h, dp, ry, ry, h, w, true);
|
| - }
|
| + if (outerWeight == 255) {
|
| + int loRadius, hiRadius;
|
| + get_adjusted_radii(passRadius, &loRadius, &hiRadius);
|
| + if (kHigh_Quality == quality) {
|
| + // Do three X blurs, with a transpose on the final one.
|
| + w = boxBlur(sp, src.fRowBytes, tp, loRadius, hiRadius, w, h, false);
|
| + w = boxBlur(tp, w, dp, hiRadius, loRadius, w, h, false);
|
| + w = boxBlur(dp, w, tp, hiRadius, hiRadius, w, h, true);
|
| + // Do three Y blurs, with a transpose on the final one.
|
| + h = boxBlur(tp, h, dp, loRadius, hiRadius, h, w, false);
|
| + h = boxBlur(dp, h, tp, hiRadius, loRadius, h, w, false);
|
| + h = boxBlur(tp, h, dp, hiRadius, hiRadius, h, w, true);
|
| } else {
|
| - if (kHigh_Quality == quality) {
|
| - // Do three X blurs, with a transpose on the final one.
|
| - w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, false, outerWeight);
|
| - w = boxBlurInterp(tp, w, dp, rx, w, h, false, outerWeight);
|
| - w = boxBlurInterp(dp, w, tp, rx, w, h, true, outerWeight);
|
| - // Do three Y blurs, with a transpose on the final one.
|
| - h = boxBlurInterp(tp, h, dp, ry, h, w, false, outerWeight);
|
| - h = boxBlurInterp(dp, h, tp, ry, h, w, false, outerWeight);
|
| - h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
|
| - } else {
|
| - w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, true, outerWeight);
|
| - h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
|
| - }
|
| + w = boxBlur(sp, src.fRowBytes, tp, rx, rx, w, h, true);
|
| + h = boxBlur(tp, h, dp, ry, ry, h, w, true);
|
| }
|
| } else {
|
| - const size_t storageW = sw + 2 * (passCount - 1) * rx + 1;
|
| - const size_t storageH = sh + 2 * (passCount - 1) * ry + 1;
|
| - SkAutoTMalloc<uint32_t> storage(storageW * storageH);
|
| - uint32_t* sumBuffer = storage.get();
|
| -
|
| - //pass1: sp is source, dp is destination
|
| - build_sum_buffer(sumBuffer, sw, sh, sp, src.fRowBytes);
|
| - if (outerWeight == 255) {
|
| - apply_kernel(dp, rx, ry, sumBuffer, sw, sh);
|
| + if (kHigh_Quality == quality) {
|
| + // Do three X blurs, with a transpose on the final one.
|
| + w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, false, outerWeight);
|
| + w = boxBlurInterp(tp, w, dp, rx, w, h, false, outerWeight);
|
| + w = boxBlurInterp(dp, w, tp, rx, w, h, true, outerWeight);
|
| + // Do three Y blurs, with a transpose on the final one.
|
| + h = boxBlurInterp(tp, h, dp, ry, h, w, false, outerWeight);
|
| + h = boxBlurInterp(dp, h, tp, ry, h, w, false, outerWeight);
|
| + h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
|
| } else {
|
| - apply_kernel_interp(dp, rx, ry, sumBuffer, sw, sh, outerWeight);
|
| + w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, true, outerWeight);
|
| + h = boxBlurInterp(tp, h, dp, ry, h, w, true, outerWeight);
|
| }
|
| -
|
| - if (kHigh_Quality == quality) {
|
| - //pass2: dp is source, tmpBuffer is destination
|
| - int tmp_sw = sw + 2 * rx;
|
| - int tmp_sh = sh + 2 * ry;
|
| - SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
|
| - build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, dp, tmp_sw);
|
| - if (outerWeight == 255)
|
| - apply_kernel(tmpBuffer.get(), rx, ry, sumBuffer, tmp_sw, tmp_sh);
|
| - else
|
| - apply_kernel_interp(tmpBuffer.get(), rx, ry, sumBuffer,
|
| - tmp_sw, tmp_sh, outerWeight);
|
| -
|
| - //pass3: tmpBuffer is source, dp is destination
|
| - tmp_sw += 2 * rx;
|
| - tmp_sh += 2 * ry;
|
| - build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, tmpBuffer.get(), tmp_sw);
|
| - if (outerWeight == 255)
|
| - apply_kernel(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh);
|
| - else
|
| - apply_kernel_interp(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh,
|
| - outerWeight);
|
| - }
|
| }
|
|
|
| dst->fImage = dp;
|
| @@ -1115,20 +601,6 @@
|
| return true;
|
| }
|
|
|
| -bool SkBlurMask::BlurSeparable(SkMask* dst, const SkMask& src,
|
| - SkScalar radius, Style style, Quality quality,
|
| - SkIPoint* margin)
|
| -{
|
| - return SkBlurMask::Blur(dst, src, radius, style, quality, margin, true);
|
| -}
|
| -
|
| -bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
|
| - SkScalar radius, Style style, Quality quality,
|
| - SkIPoint* margin)
|
| -{
|
| - return SkBlurMask::Blur(dst, src, radius, style, quality, margin, false);
|
| -}
|
| -
|
| /* Convolving a box with itself three times results in a piecewise
|
| quadratic function:
|
|
|
|
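|
| That trailing comment is the rationale for the path this patch keeps: a box
| filter convolved with itself three times is piecewise quadratic and already
| close to a Gaussian, so the high-quality path runs three O(n) box passes per
| axis instead of the deleted summed-area kernels. A sketch of one 1-D box
| pass built on a running window sum, the trick that boxBlur() generalizes
| (hypothetical helper with edge clamping; the real boxBlur() also grows the
| output by the radius and can transpose as it writes):
|
| static void box_blur_1d(const uint8_t src[], uint8_t dst[], int n, int r) {
|     // seed the running sum with the clamped window centered on index 0
|     uint32_t window = 0;
|     for (int i = -r; i <= r; ++i) {
|         window += src[i < 0 ? 0 : (i >= n ? n - 1 : i)];
|     }
|     const uint32_t scale = (1 << 24) / (2*r + 1);  // 8.24 reciprocal of width
|     for (int i = 0; i < n; ++i) {
|         dst[i] = (uint8_t)(window * scale >> 24);
|         int add = i + r + 1;               // slide the window right by one
|         int sub = i - r;
|         window += src[add >= n ? n - 1 : add];
|         window -= src[sub < 0 ? 0 : sub];
|     }
| }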
|