/** * This is the path for apply_kernel_interp() to be taken when the kernel * is wider than the source image. */ static void kernel_interp_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[], int sw, int sh, U8CPU outerWeight) { SkASSERT(2*rx > sw); int innerWeight = 255 - outerWeight; // round these guys up if they're bigger than 127 outerWeight += outerWeight >> 7; innerWeight += innerWeight >> 7; uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1)); uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1)); int sumStride = sw + 1; int dw = sw + 2*rx; int dh = sh + 2*ry; int prev_y = -2*ry; int next_y = 1; for (int y = 0; y < dh; ++y) { int py = SkClampPos(prev_y) * sumStride; int ny = SkFastMin32(next_y, sh) * sumStride; int ipy = SkClampPos(prev_y + 1) * sumStride; int iny = SkClampMax(next_y - 1, sh) * sumStride; int prev_x = -2*rx; int next_x = 1; for (int x = 0; x < dw; ++x) { int px = SkClampPos(prev_x); int nx = SkFastMin32(next_x, sw); int ipx = SkClampPos(prev_x + 1); int inx = SkClampMax(next_x - 1, sw); uint32_t outerSum = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny] - sum[inx+ipy] - sum[ipx+iny]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 1; next_x += 1; } prev_y += 1; next_y += 1; } }
/** * This is the path for apply_kernel() to be taken when the kernel * is wider than the source image. */ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[], int sw, int sh) { SkASSERT(2*rx > sw); uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1)); int sumStride = sw + 1; int dw = sw + 2*rx; int dh = sh + 2*ry; int prev_y = -2*ry; int next_y = 1; for (int y = 0; y < dh; ++y) { int py = SkClampPos(prev_y) * sumStride; int ny = SkFastMin32(next_y, sh) * sumStride; int prev_x = -2*rx; int next_x = 1; for (int x = 0; x < dw; ++x) { int px = SkClampPos(prev_x); int nx = SkFastMin32(next_x, sw); // TODO: should we be adding 1/2 (1 << 23) to round to the // nearest integer here? uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 1; next_x += 1; } prev_y += 1; next_y += 1; } }
// Perform a brute force convolution of a step function with a Gaussian. // Return the right half in 'result' static void brute_force_1d(SkScalar stepMin, SkScalar stepMax, SkScalar gaussianSigma, int* result, int resultCount) { int gaussianRange = SkScalarCeilToInt(10 * gaussianSigma); for (int i = 0; i < resultCount; ++i) { SkScalar sum = 0.0f; for (int j = -gaussianRange; j < gaussianRange; ++j) { sum += gaussian(j, gaussianSigma) * step(i-j, stepMin, stepMax); } result[i] = SkClampMax(SkClampPos(int(sum + 0.5f)), 255); } }
/**
 *  Slow path of apply_kernel(), used when the kernel is wider than the
 *  source image (2*rx > sw): both box edges are pinned for every sample.
 *
 *  Writes (sw + 2*rx) * (sh + 2*ry) bytes to dst; each one is the
 *  (2*rx + 1) x (2*ry + 1) box sum from 'sum' times a 24-bit fixed-point
 *  reciprocal of the box area.
 */
static void kernel_clamped(uint8_t dst[], int rx, int ry,
                           const uint32_t sum[], int sw, int sh) {
    SkASSERT(2*rx > sw);

    const uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));

    const int stride = sw + 1;      // 'sum' rows are sw + 1 entries apart
    const int outWidth = sw + 2*rx;
    const int outHeight = sh + 2*ry;

    int y = 0;
    while (y < outHeight) {
        const int rowAbove = SkClampPos(y - 2*ry) * stride;
        const int rowBelow = SkFastMin32(y + 1, sh) * stride;

        int x = 0;
        while (x < outWidth) {
            const int colLeft = SkClampPos(x - 2*rx);
            const int colRight = SkFastMin32(x + 1, sw);

            const uint32_t area = sum[colLeft+rowAbove] + sum[colRight+rowBelow]
                                - sum[colRight+rowAbove] - sum[colLeft+rowBelow];
            *dst++ = SkToU8(area * scale >> 24);

            x++;
        }
        y++;
    }
}
bool SkBlurMask::BlurGroundTruth(SkMask* dst, const SkMask& src, SkScalar provided_radius, Style style, SkIPoint* margin) { if (src.fFormat != SkMask::kA8_Format) { return false; } float radius = SkScalarToFloat(SkScalarMul(provided_radius, kBlurRadiusFudgeFactor)); float stddev = SkScalarToFloat(radius) /2.0f; float variance = stddev * stddev; int windowSize = SkScalarCeil(stddev*4); // round window size up to nearest odd number windowSize |= 1; SkAutoTMalloc<float> gaussWindow(windowSize); int halfWindow = windowSize >> 1; gaussWindow[halfWindow] = 1; float windowSum = 1; for (int x = 1 ; x <= halfWindow ; ++x) { float gaussian = expf(-x*x / variance); gaussWindow[halfWindow + x] = gaussWindow[halfWindow-x] = gaussian; windowSum += 2*gaussian; } // leave the filter un-normalized for now; we will divide by the normalization // sum later; int pad = halfWindow; if (margin) { margin->set( pad, pad ); } dst->fBounds = src.fBounds; dst->fBounds.outset(pad, pad); dst->fRowBytes = dst->fBounds.width(); dst->fFormat = SkMask::kA8_Format; dst->fImage = NULL; if (src.fImage) { size_t dstSize = dst->computeImageSize(); if (0 == dstSize) { return false; // too big to allocate, abort } int srcWidth = src.fBounds.width(); int srcHeight = src.fBounds.height(); int dstWidth = dst->fBounds.width(); const uint8_t* srcPixels = src.fImage; uint8_t* dstPixels = SkMask::AllocImage(dstSize); SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dstPixels); // do the actual blur. First, make a padded copy of the source. 
// use double pad so we never have to check if we're outside anything int padWidth = srcWidth + 4*pad; int padHeight = srcHeight; int padSize = padWidth * padHeight; SkAutoTMalloc<uint8_t> padPixels(padSize); memset(padPixels, 0, padSize); for (int y = 0 ; y < srcHeight; ++y) { uint8_t* padptr = padPixels + y * padWidth + 2*pad; const uint8_t* srcptr = srcPixels + y * srcWidth; memcpy(padptr, srcptr, srcWidth); } // blur in X, transposing the result into a temporary floating point buffer. // also double-pad the intermediate result so that the second blur doesn't // have to do extra conditionals. int tmpWidth = padHeight + 4*pad; int tmpHeight = padWidth - 2*pad; int tmpSize = tmpWidth * tmpHeight; SkAutoTMalloc<float> tmpImage(tmpSize); memset(tmpImage, 0, tmpSize*sizeof(tmpImage[0])); for (int y = 0 ; y < padHeight ; ++y) { uint8_t *srcScanline = padPixels + y*padWidth; for (int x = pad ; x < padWidth - pad ; ++x) { float *outPixel = tmpImage + (x-pad)*tmpWidth + y + 2*pad; // transposed output uint8_t *windowCenter = srcScanline + x; for (int i = -pad ; i <= pad ; ++i) { *outPixel += gaussWindow[pad+i]*windowCenter[i]; } *outPixel /= windowSum; } } // blur in Y; now filling in the actual desired destination. We have to do // the transpose again; these transposes guarantee that we read memory in // linear order. 
for (int y = 0 ; y < tmpHeight ; ++y) { float *srcScanline = tmpImage + y*tmpWidth; for (int x = pad ; x < tmpWidth - pad ; ++x) { float *windowCenter = srcScanline + x; float finalValue = 0; for (int i = -pad ; i <= pad ; ++i) { finalValue += gaussWindow[pad+i]*windowCenter[i]; } finalValue /= windowSum; uint8_t *outPixel = dstPixels + (x-pad)*dstWidth + y; // transposed output int integerPixel = int(finalValue + 0.5f); *outPixel = SkClampMax( SkClampPos(integerPixel), 255 ); } } dst->fImage = dstPixels; // if need be, alloc the "real" dst (same size as src) and copy/merge // the blur into it (applying the src) if (style == kInner_Style) { // now we allocate the "real" dst, mirror the size of src size_t srcSize = src.computeImageSize(); if (0 == srcSize) { return false; // too big to allocate, abort } dst->fImage = SkMask::AllocImage(srcSize); merge_src_with_blur(dst->fImage, src.fRowBytes, srcPixels, src.fRowBytes, dstPixels + pad*dst->fRowBytes + pad, dst->fRowBytes, srcWidth, srcHeight); SkMask::FreeImage(dstPixels); } else if (style != kNormal_Style) { clamp_with_orig(dstPixels + pad*dst->fRowBytes + pad, dst->fRowBytes, srcPixels, src.fRowBytes, srcWidth, srcHeight, style); } (void)autoCall.detach(); } if (style == kInner_Style) { dst->fBounds = src.fBounds; // restore trimmed bounds dst->fRowBytes = src.fRowBytes; } return true; }
/** * sw and sh are the width and height of the src. Since the sum buffer * matches that, but has an extra row and col at the beginning (with zeros), * we can just use sw and sh as our "max" values for pinning coordinates * when sampling into sum[][] * * The inner loop is conceptually simple; we break it into several variants * to improve performance. Here's the original version: for (int x = 0; x < dw; ++x) { int px = SkClampPos(prev_x); int nx = SkFastMin32(next_x, sw); int ipx = SkClampPos(prev_x + 1); int inx = SkClampMax(next_x - 1, sw); uint32_t outerSum = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny] - sum[inx+ipy] - sum[ipx+iny]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 1; next_x += 1; } * The sections are: * left-hand section, where prev_x is clamped to 0 * center section, where neither prev_x nor next_x is clamped * right-hand section, where next_x is clamped to sw * On some operating systems, the center section is unrolled for additional * speedup. 
*/ static void apply_kernel_interp(uint8_t dst[], int rx, int ry, const uint32_t sum[], int sw, int sh, U8CPU outerWeight) { SkASSERT(rx > 0 && ry > 0); SkASSERT(outerWeight <= 255); if (2*rx > sw) { kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outerWeight); return; } int innerWeight = 255 - outerWeight; // round these guys up if they're bigger than 127 outerWeight += outerWeight >> 7; innerWeight += innerWeight >> 7; uint32_t outerScale = (outerWeight << 16) / ((2*rx + 1)*(2*ry + 1)); uint32_t innerScale = (innerWeight << 16) / ((2*rx - 1)*(2*ry - 1)); int sumStride = sw + 1; int dw = sw + 2*rx; int dh = sh + 2*ry; int prev_y = -2*ry; int next_y = 1; SkASSERT(2*rx <= dw - 2*rx); for (int y = 0; y < dh; ++y) { int py = SkClampPos(prev_y) * sumStride; int ny = SkFastMin32(next_y, sh) * sumStride; int ipy = SkClampPos(prev_y + 1) * sumStride; int iny = SkClampMax(next_y - 1, sh) * sumStride; int prev_x = -2*rx; int next_x = 1; int x = 0; for (; x < 2*rx; ++x) { SkASSERT(prev_x < 0); SkASSERT(next_x <= sw); int px = 0; int nx = next_x; int ipx = 0; int inx = next_x - 1; uint32_t outerSum = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny] - sum[inx+ipy] - sum[ipx+iny]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 1; next_x += 1; } int i0 = prev_x + py; int i1 = next_x + ny; int i2 = next_x + py; int i3 = prev_x + ny; int i4 = prev_x + 1 + ipy; int i5 = next_x - 1 + iny; int i6 = next_x - 1 + ipy; int i7 = prev_x + 1 + iny; #if UNROLL_KERNEL_LOOP for (; x < dw - 2*rx - 4; x += 4) { SkASSERT(prev_x >= 0); SkASSERT(next_x <= sw); uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; *dst++ = SkToU8((outerSum * outerScale 
+ innerSum * innerScale) >> 24); outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 4; next_x += 4; } #endif for (; x < dw - 2*rx; ++x) { SkASSERT(prev_x >= 0); SkASSERT(next_x <= sw); uint32_t outerSum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; uint32_t innerSum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 1; next_x += 1; } for (; x < dw; ++x) { SkASSERT(prev_x >= 0); SkASSERT(next_x > sw); int px = prev_x; int nx = sw; int ipx = prev_x + 1; int inx = sw; uint32_t outerSum = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; uint32_t innerSum = sum[ipx+ipy] + sum[inx+iny] - sum[inx+ipy] - sum[ipx+iny]; *dst++ = SkToU8((outerSum * outerScale + innerSum * innerScale) >> 24); prev_x += 1; next_x += 1; } prev_y += 1; next_y += 1; } }
/** * sw and sh are the width and height of the src. Since the sum buffer * matches that, but has an extra row and col at the beginning (with zeros), * we can just use sw and sh as our "max" values for pinning coordinates * when sampling into sum[][] * * The inner loop is conceptually simple; we break it into several sections * to improve performance. Here's the original version: for (int x = 0; x < dw; ++x) { int px = SkClampPos(prev_x); int nx = SkFastMin32(next_x, sw); uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 1; next_x += 1; } * The sections are: * left-hand section, where prev_x is clamped to 0 * center section, where neither prev_x nor next_x is clamped * right-hand section, where next_x is clamped to sw * On some operating systems, the center section is unrolled for additional * speedup. */ static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[], int sw, int sh) { if (2*rx > sw) { kernel_clamped(dst, rx, ry, sum, sw, sh); return; } uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1)); int sumStride = sw + 1; int dw = sw + 2*rx; int dh = sh + 2*ry; int prev_y = -2*ry; int next_y = 1; SkASSERT(2*rx <= dw - 2*rx); for (int y = 0; y < dh; ++y) { int py = SkClampPos(prev_y) * sumStride; int ny = SkFastMin32(next_y, sh) * sumStride; int prev_x = -2*rx; int next_x = 1; int x = 0; for (; x < 2*rx; ++x) { SkASSERT(prev_x <= 0); SkASSERT(next_x <= sw); int px = 0; int nx = next_x; uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 1; next_x += 1; } int i0 = prev_x + py; int i1 = next_x + ny; int i2 = next_x + py; int i3 = prev_x + ny; #if UNROLL_KERNEL_LOOP for (; x < dw - 2*rx - 4; x += 4) { SkASSERT(prev_x >= 0); SkASSERT(next_x <= sw); uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; *dst++ = SkToU8(tmp * scale >> 24); tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; *dst++ = SkToU8(tmp * scale >> 
24); tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; *dst++ = SkToU8(tmp * scale >> 24); tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 4; next_x += 4; } #endif for (; x < dw - 2*rx; ++x) { SkASSERT(prev_x >= 0); SkASSERT(next_x <= sw); uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 1; next_x += 1; } for (; x < dw; ++x) { SkASSERT(prev_x >= 0); SkASSERT(next_x > sw); int px = prev_x; int nx = sw; uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; *dst++ = SkToU8(tmp * scale >> 24); prev_x += 1; next_x += 1; } prev_y += 1; next_y += 1; } }