// Imports one source row 'src' into wrk->frow while shrinking it horizontally.
// Channels are interleaved in 'src' and handled one after the other.
// For each output sample, 'accum' is raised by x_add and then lowered by
// x_sub for every source sample consumed, until it is no longer positive.
// The overshoot (-accum) weights the last consumed sample: that weighted part
// is subtracted from the current output and seeds the sum for the next one.
void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
  const int step = wrk->num_channels;
  const int out_end = wrk->dst_width * wrk->num_channels;
  int c;
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);
  for (c = 0; c < step; ++c) {
    int in_pos = c;
    int out_pos;
    uint32_t total = 0;
    int accum = 0;
    for (out_pos = c; out_pos < out_end; out_pos += step) {
      uint32_t last = 0;   // most recently consumed source sample
      rescaler_t carry;    // share of 'last' that belongs to the next output
      // Consume source samples until the accumulator is used up.
      for (accum += wrk->x_add; accum > 0; accum -= wrk->x_sub) {
        assert(in_pos < wrk->src_width * step);
        last = src[in_pos];
        total += last;
        in_pos += step;
      }
      // Emit next horizontal pixel.
      carry = last * (-accum);
      wrk->frow[out_pos] = total * wrk->x_sub - carry;
      // Fresh fractional start for next pixel (MULT_FIX is defined elsewhere;
      // presumably a rounded fixed-point multiply -- confirm).
      total = (int)MULT_FIX(carry, wrk->fx_scale);
    }
    // The whole row must have been consumed exactly.
    assert(accum == 0);
  }
}
// Imports one source row 'src' into wrk->frow while expanding it horizontally.
// Channels are interleaved in 'src' and handled one after the other.
// Every output sample is a linear blend of two neighbouring source samples:
//   frow[x] = right * x_add + (left - right) * accum
//           = left * accum + right * (x_add - accum)
// 'accum' starts at x_add and is lowered by x_sub per output sample; when it
// becomes negative the (left, right) pair moves one source sample forward
// and x_add is added back.
// If src_width is 1, 'right' simply duplicates 'left'.
void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  int channel;
  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);
  for (channel = 0; channel < x_stride; ++channel) {
    int x_in = channel;
    int x_out = channel;
    // simple bilinear interpolation
    int accum = wrk->x_add;
    // The blend is evaluated with unsigned 32-bit arithmetic. The intermediate
    // 'left - right' may wrap, but the final sum is non-negative, so the
    // modular result equals the exact value whenever it fits in 32 bits.
    // Doing this in signed 'int' would be undefined behaviour as soon as
    // 255 * x_add exceeds INT_MAX.
    uint32_t left = src[x_in];
    uint32_t right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
    x_in += x_stride;
    while (1) {
      wrk->frow[x_out] =
          right * (uint32_t)wrk->x_add + (left - right) * (uint32_t)accum;
      x_out += x_stride;
      if (x_out >= x_out_max) break;
      accum -= wrk->x_sub;
      if (accum < 0) {
        // Advance the interpolation window by one source sample.
        left = right;
        x_in += x_stride;
        assert(x_in < wrk->src_width * x_stride);
        right = src[x_in];
        accum += wrk->x_add;
      }
    }
    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
  }
}
// Imports one source row into the rescaler, picking the horizontal expand or
// shrink routine according to wrk->x_expand.
// Must not be called once all input rows have been consumed.
void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
  assert(!WebPRescalerInputDone(wrk));
  if (wrk->x_expand) {
    WebPRescalerImportRowExpand(wrk, src);
    return;
  }
  WebPRescalerImportRowShrink(wrk, src);
}
// SSE2 version of the horizontal-shrink row import. Handles four interleaved
// channels at once, one output pixel (four 32-bit values) per iteration.
// Defers to WebPRescalerImportRowShrink_C() when the row does not have exactly
// four channels, or when x_add > (x_sub << 7), since the per-channel sums are
// kept in 16-bit lanes (see the overflow note in the accumulation loop).
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  const __m128i zero = _mm_setzero_si128();
  const __m128i x_sub_16 = _mm_set1_epi16(x_sub);
  const __m128i scale_32 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i round_64 = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const rescaler_t* const dst_end = wrk->frow + 4 * wrk->dst_width;
  rescaler_t* dst;
  __m128i total = zero;   // running per-channel sums, 16b lanes
  int accum = 0;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (dst = wrk->frow; dst < dst_end; dst += 4) {
    __m128i last = zero;   // most recently consumed pixel, 16b lanes
    for (accum += wrk->x_add; accum > 0; accum -= x_sub) {
      // Load one 4-byte pixel and widen each channel to 16 bits.
      const __m128i packed = _mm_cvtsi32_si128(WebPMemToUint32(src));
      last = _mm_unpacklo_epi8(packed, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      total = _mm_add_epi16(total, last);
      src += 4;
    }
    {
      // Emit next horizontal pixel.
      const __m128i overshoot = _mm_set1_epi16(-accum);
      // frac = last * (-accum), built as 32b from low/high 16b halves.
      const __m128i frac_lo = _mm_mullo_epi16(last, overshoot);
      const __m128i frac_hi = _mm_mulhi_epu16(last, overshoot);
      const __m128i frac = _mm_unpacklo_epi16(frac_lo, frac_hi);
      // prod = total * x_sub, built the same way.
      const __m128i prod_lo = _mm_mullo_epi16(total, x_sub_16);
      const __m128i prod_hi = _mm_mulhi_epu16(total, x_sub_16);
      const __m128i prod = _mm_unpacklo_epi16(prod_lo, prod_hi);
      // Output value: total * x_sub - frac.
      const __m128i out = _mm_sub_epi32(prod, frac);
      // Carry for the next pixel: (frac * fx_scale + ROUNDER), upper 32 bits.
      // _mm_mul_epu32 only multiplies the even 32b lanes, so the odd lanes
      // are shifted down and processed separately.
      const __m128i frac_odd = _mm_srli_epi64(frac, 32);
      const __m128i mul_even = _mm_mul_epu32(frac, scale_32);
      const __m128i mul_odd = _mm_mul_epu32(frac_odd, scale_32);
      const __m128i sum_even = _mm_add_epi64(mul_even, round_64);
      const __m128i sum_odd = _mm_add_epi64(mul_odd, round_64);
      // Keep the upper 32 bits of each 64b sum in the two lowest lanes.
      const __m128i top_even = _mm_shuffle_epi32(sum_even, 1 | (3 << 2));
      const __m128i top_odd = _mm_shuffle_epi32(sum_odd, 1 | (3 << 2));
      // Re-interleave even/odd results back into channel order.
      const __m128i carry = _mm_unpacklo_epi32(top_even, top_odd);
      _mm_storeu_si128((__m128i*)dst, out);
      // Fresh fractional start for next pixel, narrowed back to 16b lanes.
      total = _mm_packs_epi32(carry, zero);
    }
  }
  assert(accum == 0);
}
// SSE2 version of the horizontal-expand row import.
// Each output is computed with _mm_madd_epi16 as
//   lane_lo * accum + lane_hi * (x_add - accum)
// where (lane_lo, lane_hi) are adjacent 16-bit lanes of 'pixels'.
// Defers to WebPRescalerImportRowExpand_C() when src_width < 8 or when
// x_add does not fit in 15 bits.
// NOTE(review): LoadTwoPixels_SSE2 / LoadEightPixels_SSE2 are defined
// elsewhere; they presumably arrange neighbouring samples as 16-bit
// (left, right) lane pairs -- confirm against their definitions.
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_add = wrk->x_add;
  rescaler_t* dst = wrk->frow;
  const rescaler_t* const dst_end = dst + wrk->dst_width * wrk->num_channels;
  int accum = x_add;
  __m128i pixels;

  // SSE2 implementation only works with 16b signed arithmetic at max.
  if (wrk->src_width < 8 || accum >= (1 << 15)) {
    WebPRescalerImportRowExpand_C(wrk, src);
    return;
  }

  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);

  if (wrk->num_channels != 4) {
    // One 32-bit output per iteration, taken from the lowest lane pair.
    // NOTE(review): the load limit is derived from src_width alone, which
    // suggests this path expects single-channel rows -- confirm with callers.
    const uint8_t* const last_full_load = src + wrk->src_width - 8;
    int remaining;   // number of 2-byte shifts left before a reload is needed
    LoadEightPixels_SSE2(src, &pixels);
    src += 7;
    remaining = 7;
    for (;;) {
      const __m128i weights =
          _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
      const __m128i blended = _mm_madd_epi16(pixels, weights);
      assert(sizeof(*dst) == sizeof(uint32_t));
      WebPUint32ToMem((uint8_t*)dst, _mm_cvtsi128_si32(blended));
      dst += 1;
      if (dst >= dst_end) break;
      accum -= wrk->x_sub;
      if (accum < 0) {
        // Advance the interpolation window by one source sample.
        --remaining;
        if (remaining != 0) {
          pixels = _mm_srli_si128(pixels, 2);
        } else if (src <= last_full_load) {
          LoadEightPixels_SSE2(src, &pixels);
          src += 7;
          remaining = 7;
        } else {
          // tail: not enough input left for a full load, so shift and
          // insert the next sample by hand.
          pixels = _mm_srli_si128(pixels, 2);
          pixels = _mm_insert_epi16(pixels, src[1], 1);
          src += 1;
          remaining = 1;
        }
        accum += x_add;
      }
    }
  } else {
    // Four channels: one full pixel (four 32-bit outputs) per iteration.
    LoadTwoPixels_SSE2(src, &pixels);
    src += 4;
    for (;;) {
      const __m128i weights =
          _mm_set1_epi32(((x_add - accum) << 16) | accum);
      const __m128i blended = _mm_madd_epi16(pixels, weights);
      _mm_storeu_si128((__m128i*)dst, blended);
      dst += 4;
      if (dst >= dst_end) break;
      accum -= wrk->x_sub;
      if (accum < 0) {
        // Advance the interpolation window by one source pixel.
        LoadTwoPixels_SSE2(src, &pixels);
        src += 4;
        accum += x_add;
      }
    }
  }
  assert(accum == 0);
}