static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; const int L = top[-5]; WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); }
static void HE4_C(uint8_t* dst) { // horizontal const int A = dst[-1 - BPS]; const int B = dst[-1]; const int C = dst[-1 + BPS]; const int D = dst[-1 + 2 * BPS]; const int E = dst[-1 + 3 * BPS]; WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C)); WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D)); WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E)); WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E)); }
// SSE2 row import for the horizontally-expanding rescaler.
//
// Fills wrk->frow with wrk->dst_width * wrk->num_channels entries. Each entry
// is produced by _mm_madd_epi16() of the current packed source samples with
// the 16-bit weight pair (accum, x_add - accum), where 'accum' starts at
// wrk->x_add and decreases by wrk->x_sub per output entry. Whenever 'accum'
// drops below zero the source position advances and x_add is added back.
//
// Falls back to WebPRescalerImportRowExpand_C() when the source row is
// narrower than 8 or when x_add does not fit in 15 bits.
//
//   wrk: rescaler state (reads frow, dst_width, src_width, num_channels,
//        x_add, x_sub; writes the frow[] contents).
//   src: source row to import.
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
  int accum = x_add;
  __m128i cur_pixels;

  // SSE2 implementation only works with 16b signed arithmetic at max.
  if (wrk->src_width < 8 || x_add >= (1 << 15)) {
    WebPRescalerImportRowExpand_C(wrk, src);
    return;
  }

  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);

  if (wrk->num_channels == 4) {
    // Four-channel path: one iteration emits four 32-bit entries.
    // NOTE(review): LoadTwoPixels_SSE2 presumably packs two adjacent 4-channel
    // pixels as per-channel 16-bit pairs -- confirm against its definition.
    LoadTwoPixels_SSE2(src, &cur_pixels);
    src += 4;
    for (;;) {
      // Same weight pair in every 32-bit lane: high half is (x_add - accum),
      // low half is accum.
      const __m128i weights =
          _mm_set1_epi32(((x_add - accum) << 16) | accum);
      _mm_storeu_si128((__m128i*)frow, _mm_madd_epi16(cur_pixels, weights));
      frow += 4;
      if (frow >= frow_end) break;
      accum -= wrk->x_sub;
      if (accum >= 0) continue;  // still between the same two source pixels
      LoadTwoPixels_SSE2(src, &cur_pixels);
      src += 4;
      accum += x_add;
    }
  } else {
    // Other channel counts: one 32-bit entry per iteration, taken from the
    // lowest lane of the register.
    // A full reload is only attempted while src <= src_limit.
    const uint8_t* const src_limit = src + wrk->src_width - 8;
    int left = 7;  // source advances left before the register is exhausted
    // NOTE(review): LoadEightPixels_SSE2 presumably widens eight source bytes
    // to 16-bit lanes -- confirm against its definition.
    LoadEightPixels_SSE2(src, &cur_pixels);
    src += 7;
    for (;;) {
      // Only the lowest 32-bit lane carries weights; the rest are zero.
      const __m128i weights =
          _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
      const __m128i blended = _mm_madd_epi16(cur_pixels, weights);
      assert(sizeof(*frow) == sizeof(uint32_t));
      WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(blended));
      ++frow;
      if (frow >= frow_end) break;
      accum -= wrk->x_sub;
      if (accum >= 0) continue;  // same source pair, new weights next time
      --left;
      if (left != 0) {
        // Next sample pair is already in the register: shift by one lane.
        cur_pixels = _mm_srli_si128(cur_pixels, 2);
      } else if (src <= src_limit) {
        // Register exhausted and a full reload is permitted.
        LoadEightPixels_SSE2(src, &cur_pixels);
        src += 7;
        left = 7;
      } else {   // tail
        // Past the reload limit: shift one lane and insert the next source
        // byte into lane 1, one sample at a time.
        cur_pixels = _mm_srli_si128(cur_pixels, 2);
        cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
        src += 1;
        left = 1;
      }
      accum += x_add;
    }
  }
  assert(accum == 0);
}