bool BGRAConvolve2D(const unsigned char* sourceData, int sourceByteRowStride, bool sourceHasAlpha, const SkConvolutionFilter1D& filterX, const SkConvolutionFilter1D& filterY, int outputByteRowStride, unsigned char* output) { int maxYFilterSize = filterY.maxFilter(); // The next row in the input that we will generate a horizontally // convolved row for. If the filter doesn't start at the beginning of the // image (this is the case when we are only resizing a subset), then we // don't want to generate any output rows before that. Compute the starting // row for convolution as the first pixel for the first vertical filter. int filterOffset, filterLength; const SkConvolutionFilter1D::ConvolutionFixed* filterValues = filterY.FilterForValue(0, &filterOffset, &filterLength); int nextXRow = filterOffset; // We loop over each row in the input doing a horizontal convolution. This // will result in a horizontally convolved image. We write the results into // a circular buffer of convolved rows and do vertical convolution as rows // are available. This prevents us from having to store the entire // intermediate image and helps cache coherency. // We will need four extra rows to allow horizontal convolution could be done // simultaneously. We also pad each row in row buffer to be aligned-up to // 32 bytes. // TODO(jiesun): We do not use aligned load from row buffer in vertical // convolution pass yet. Somehow Windows does not like it. int rowBufferWidth = (filterX.numValues() + 31) & ~0x1F; int rowBufferHeight = maxYFilterSize + (SkOpts::convolve_4_rows_horizontally != nullptr ? 4 : 0); // check for too-big allocation requests : crbug.com/528628 { int64_t size = sk_64_mul(rowBufferWidth, rowBufferHeight); // need some limit, to avoid over-committing success from malloc, but then // crashing when we try to actually use the memory. // 100meg seems big enough to allow "normal" zoom factors and image sizes through // while avoiding the crash seen by the bug (crbug.com/528628) if (size > 100 * 1024 * 1024) { // SkDebugf("BGRAConvolve2D: tmp allocation [%lld] too big\n", size); return false; } } CircularRowBuffer rowBuffer(rowBufferWidth, rowBufferHeight, filterOffset); // Loop over every possible output row, processing just enough horizontal // convolutions to run each subsequent vertical convolution. SkASSERT(outputByteRowStride >= filterX.numValues() * 4); int numOutputRows = filterY.numValues(); // We need to check which is the last line to convolve before we advance 4 // lines in one iteration. int lastFilterOffset, lastFilterLength; filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, &lastFilterLength); for (int outY = 0; outY < numOutputRows; outY++) { filterValues = filterY.FilterForValue(outY, &filterOffset, &filterLength); // Generate output rows until we have enough to run the current filter. while (nextXRow < filterOffset + filterLength) { if (SkOpts::convolve_4_rows_horizontally != nullptr && nextXRow + 3 < lastFilterOffset + lastFilterLength) { const unsigned char* src[4]; unsigned char* outRow[4]; for (int i = 0; i < 4; ++i) { src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride]; outRow[i] = rowBuffer.advanceRow(); } SkOpts::convolve_4_rows_horizontally(src, filterX, outRow, 4*rowBufferWidth); nextXRow += 4; } else { SkOpts::convolve_horizontally( &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX, rowBuffer.advanceRow(), sourceHasAlpha); nextXRow++; } } // Compute where in the output image this row of final data will go. unsigned char* curOutputRow = &output[(uint64_t)outY * outputByteRowStride]; // Get the list of rows that the circular buffer has, in order. int firstRowInCircularBuffer; unsigned char* const* rowsToConvolve = rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); // Now compute the start of the subset of those rows that the filter needs. unsigned char* const* firstRowForFilter = &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; SkOpts::convolve_vertically(filterValues, filterLength, firstRowForFilter, filterX.numValues(), curOutputRow, sourceHasAlpha); } return true; }
void BGRAConvolve2D(const unsigned char* sourceData, int sourceByteRowStride, bool sourceHasAlpha, const SkConvolutionFilter1D& filterX, const SkConvolutionFilter1D& filterY, int outputByteRowStride, unsigned char* output, const SkConvolutionProcs& convolveProcs, bool useSimdIfPossible) { int maxYFilterSize = filterY.maxFilter(); // The next row in the input that we will generate a horizontally // convolved row for. If the filter doesn't start at the beginning of the // image (this is the case when we are only resizing a subset), then we // don't want to generate any output rows before that. Compute the starting // row for convolution as the first pixel for the first vertical filter. int filterOffset, filterLength; const SkConvolutionFilter1D::ConvolutionFixed* filterValues = filterY.FilterForValue(0, &filterOffset, &filterLength); int nextXRow = filterOffset; // We loop over each row in the input doing a horizontal convolution. This // will result in a horizontally convolved image. We write the results into // a circular buffer of convolved rows and do vertical convolution as rows // are available. This prevents us from having to store the entire // intermediate image and helps cache coherency. // We will need four extra rows to allow horizontal convolution could be done // simultaneously. We also pad each row in row buffer to be aligned-up to // 16 bytes. // TODO(jiesun): We do not use aligned load from row buffer in vertical // convolution pass yet. Somehow Windows does not like it. int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; int rowBufferHeight = maxYFilterSize + (convolveProcs.fConvolve4RowsHorizontally ? 4 : 0); CircularRowBuffer rowBuffer(rowBufferWidth, rowBufferHeight, filterOffset); // Loop over every possible output row, processing just enough horizontal // convolutions to run each subsequent vertical convolution. SkASSERT(outputByteRowStride >= filterX.numValues() * 4); int numOutputRows = filterY.numValues(); // We need to check which is the last line to convolve before we advance 4 // lines in one iteration. int lastFilterOffset, lastFilterLength; // SSE2 can access up to 3 extra pixels past the end of the // buffer. At the bottom of the image, we have to be careful // not to access data past the end of the buffer. Normally // we fall back to the C++ implementation for the last row. // If the last row is less than 3 pixels wide, we may have to fall // back to the C++ version for more rows. Compute how many // rows we need to avoid the SSE implementation for here. filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset, &lastFilterLength); int avoidSimdRows = 1 + convolveProcs.fExtraHorizontalReads / (lastFilterOffset + lastFilterLength); filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, &lastFilterLength); for (int outY = 0; outY < numOutputRows; outY++) { filterValues = filterY.FilterForValue(outY, &filterOffset, &filterLength); // Generate output rows until we have enough to run the current filter. while (nextXRow < filterOffset + filterLength) { if (convolveProcs.fConvolve4RowsHorizontally && nextXRow + 3 < lastFilterOffset + lastFilterLength - avoidSimdRows) { const unsigned char* src[4]; unsigned char* outRow[4]; for (int i = 0; i < 4; ++i) { src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride]; outRow[i] = rowBuffer.advanceRow(); } convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth); nextXRow += 4; } else { // Check if we need to avoid SSE2 for this row. if (convolveProcs.fConvolveHorizontally && nextXRow < lastFilterOffset + lastFilterLength - avoidSimdRows) { convolveProcs.fConvolveHorizontally( &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX, rowBuffer.advanceRow(), sourceHasAlpha); } else { if (sourceHasAlpha) { ConvolveHorizontallyAlpha( &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX, rowBuffer.advanceRow()); } else { ConvolveHorizontallyNoAlpha( &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX, rowBuffer.advanceRow()); } } nextXRow++; } } // Compute where in the output image this row of final data will go. unsigned char* curOutputRow = &output[(uint64_t)outY * outputByteRowStride]; // Get the list of rows that the circular buffer has, in order. int firstRowInCircularBuffer; unsigned char* const* rowsToConvolve = rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); // Now compute the start of the subset of those rows that the filter // needs. unsigned char* const* firstRowForFilter = &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; if (convolveProcs.fConvolveVertically) { convolveProcs.fConvolveVertically(filterValues, filterLength, firstRowForFilter, filterX.numValues(), curOutputRow, sourceHasAlpha); } else { ConvolveVertically(filterValues, filterLength, firstRowForFilter, filterX.numValues(), curOutputRow, sourceHasAlpha); } } }