template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata, const int istride, const char *odata, const int ostride, const int iwidth, const int iheight, const int ooffset_x, const int ooffset_y, const int owidth, const int oheight) { int16_t *idata = (int16_t *)_idata; const int skip = 1; const __m128i ONE = _mm_set1_epi16(1); const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1)); const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2, 13,12, 9,8, 5,4, 1,0); const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1); const __m128i ZERO = _mm_set1_epi16(0); (void)iwidth; (void)iheight; for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) { for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) { __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]); __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]); D0 = _mm_shuffle_epi8(D0, SHUF); D8 = _mm_shuffle_epi8(D8, SHUF); __m128i E0 = _mm_unpacklo_epi64(D0, D8); __m128i O1 = _mm_unpackhi_epi64(D0, D8); __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1)); __m128i X1 = _mm_add_epi16(O1, X0); __m128i Z0 = _mm_unpacklo_epi16(X0, X1); __m128i Z8 = _mm_unpackhi_epi16(X0, X1); if (shift != 0) { Z0 = _mm_add_epi16(Z0, ONE); Z8 = _mm_add_epi16(Z8, ONE); Z0 = _mm_srai_epi16(Z0, shift); Z8 = _mm_srai_epi16(Z8, shift); } Z0 = _mm_add_epi16(Z0, OFFSET); Z8 = _mm_add_epi16(Z8, OFFSET); Z0 = _mm_min_epi16(Z0, CLIP); Z8 = _mm_min_epi16(Z8, CLIP); Z0 = _mm_max_epi16(Z0, ZERO); Z8 = _mm_max_epi16(Z8, ZERO); _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0); _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8); } } }
SIMDValue SIMDInt8x16Operation::OpShiftRightByScalar(const SIMDValue& value, int8 count) { X86SIMDValue x86Result; X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(value); X86SIMDValue x86tmp1; const _x86_SIMDValue X86_LOWBYTE_MASK = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff }; const _x86_SIMDValue X86_HIGHBYTE_MASK = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 }; if (count < 0 || count > 8) { count = 8; } x86tmp1.m128i_value = _mm_slli_epi16(tmpaValue.m128i_value, 8); x86tmp1.m128i_value = _mm_srai_epi16(x86tmp1.m128i_value, count + 8); x86tmp1.m128i_value = _mm_and_si128(x86tmp1.m128i_value, X86_LOWBYTE_MASK.m128i_value); tmpaValue.m128i_value = _mm_srai_epi16(tmpaValue.m128i_value, count); tmpaValue.m128i_value = _mm_and_si128(tmpaValue.m128i_value, X86_HIGHBYTE_MASK.m128i_value); x86Result.m128i_value = _mm_or_si128(tmpaValue.m128i_value, x86tmp1.m128i_value); return X86SIMDValue::ToSIMDValue(x86Result); }
void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride, uint16_t *dst, int dst_stride, int bd) { __m128i u[4], v[4]; const __m128i ones = _mm_set1_epi16(1); transClipPixel(src, src_stride, u, bd); v[0] = _mm_loadl_epi64((__m128i const *)dst); v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); u[0] = _mm_add_epi16(u[0], v[0]); u[1] = _mm_add_epi16(u[1], v[1]); u[2] = _mm_add_epi16(u[2], v[2]); u[3] = _mm_add_epi16(u[3], v[3]); u[0] = _mm_add_epi16(u[0], ones); u[1] = _mm_add_epi16(u[1], ones); u[2] = _mm_add_epi16(u[2], ones); u[3] = _mm_add_epi16(u[3], ones); u[0] = _mm_srai_epi16(u[0], 1); u[1] = _mm_srai_epi16(u[1], 1); u[2] = _mm_srai_epi16(u[2], 1); u[3] = _mm_srai_epi16(u[3], 1); writePixel(u, width, pixelsNum, dst, dst_stride); }
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
  *x = _mm_packs_epi16(lo_1, hi_1);
}
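// Illustrative scalar equivalent of SignedShift8b above (an assumed sketch,
// not part of the original source): each byte is sign-extended to 16 bits,
// shifted arithmetically by 3, then narrowed back to a signed byte.
static inline void SignedShift8b_scalar(int8_t x[16]) {
  for (int i = 0; i < 16; ++i) {
    const int16_t wide = (int16_t)x[i];  // s8 -> s16 sign extend
    const int16_t shifted = wide >> 3;   // arithmetic shift keeps the sign
    x[i] = (int8_t)shifted;              // result always fits in int8
  }
}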
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }
    t_coeff += 8;
  }
}
static void
rfx_dwt_2d_encode_block_vert_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int total_width;
	int x;
	int n;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	total_width = subband_width << 1;

	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x += 8)
		{
			src_2n = _mm_load_si128((__m128i*) src);
			src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
			if (n < subband_width - 1)
				src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
			else
				src_2n_2 = src_2n;

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);
			_mm_store_si128((__m128i*) h, h_n);

			if (n == 0)
				h_n_m = h_n;
			else
				h_n_m = _mm_load_si128((__m128i*) (h - total_width));

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);
			_mm_store_si128((__m128i*) l, l_n);

			src += 8;
			l += 8;
			h += 8;
		}
		src += total_width;
	}
}
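/* Illustrative scalar form of the lifting steps used above (an assumed helper,
   not part of the original FreeRDP code). For a single column, with 'stride'
   equal to the full row width:
   h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
   l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), mirroring at the edges. */
static void dwt_encode_vert_column_scalar(const INT16* src, INT16* l, INT16* h, int stride, int subband_width)
{
	int n;
	for (n = 0; n < subband_width; n++)
	{
		INT16 s2n = src[(2 * n) * stride];
		INT16 s2n1 = src[(2 * n + 1) * stride];
		INT16 s2n2 = (n < subband_width - 1) ? src[(2 * n + 2) * stride] : s2n;
		INT16 h_prev;

		h[n * stride] = (s2n1 - ((s2n + s2n2) >> 1)) >> 1;

		h_prev = (n == 0) ? h[0] : h[(n - 1) * stride];
		l[n * stride] = s2n + ((h_prev + h[n * stride]) >> 1);
	}
}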
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and having the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}
// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
                                      const __m128i* const a0_lo,
                                      const __m128i* const a0_hi) {
  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  *pi = _mm_adds_epi8(*pi, delta);
  *qi = _mm_subs_epi8(*qi, delta);
  FLIP_SIGN_BIT2(*pi, *qi);
}
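// Minimal scalar sketch of the same update (an assumed helper, not libwebp
// code): delta is the filter output scaled by >> 7, added to p and subtracted
// from q with signed saturation, then both are mapped back to the unsigned
// domain by flipping the sign bit (i.e. adding 128).
static inline void Update2PixelsScalar(int8_t p, int8_t q, int16_t a,
                                       uint8_t* p_out, uint8_t* q_out) {
  const int delta = a >> 7;  // arithmetic shift keeps the sign
  int pv = p + delta;
  int qv = q - delta;
  if (pv > 127) pv = 127; else if (pv < -128) pv = -128;  // adds_epi8 saturation
  if (qv > 127) qv = 127; else if (qv < -128) qv = -128;  // subs_epi8 saturation
  *p_out = (uint8_t)(pv + 128);  // sign flip back to the uint8 domain
  *q_out = (uint8_t)(qv + 128);
}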
/* Compute branch metrics (gamma) */ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) { __m128i res10, res20, res11, res21, res1, res2; __m128i in, ap, pa, g1, g0; __m128i *inPtr = (__m128i*) input; __m128i *appPtr = (__m128i*) app; __m128i *paPtr = (__m128i*) parity; __m128i *resPtr = (__m128i*) h->branch; __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); for (int i=0;i<long_cb/8;i++) { in = _mm_load_si128(inPtr); inPtr++; pa = _mm_load_si128(paPtr); paPtr++; if (appPtr) { ap = _mm_load_si128(appPtr); appPtr++; in = _mm_add_epi16(ap, in); } g1 = _mm_add_epi16(in, pa); g0 = _mm_sub_epi16(in, pa); g1 = _mm_srai_epi16(g1, 1); g0 = _mm_srai_epi16(g0, 1); res10 = _mm_shuffle_epi8(g0, res10_mask); res20 = _mm_shuffle_epi8(g0, res20_mask); res11 = _mm_shuffle_epi8(g1, res11_mask); res21 = _mm_shuffle_epi8(g1, res21_mask); res1 = _mm_or_si128(res10, res11); res2 = _mm_or_si128(res20, res21); _mm_store_si128(resPtr, res1); resPtr++; _mm_store_si128(resPtr, res2); resPtr++; } for (int i=long_cb;i<long_cb+3;i++) { h->branch[2*i] = (input[i] - parity[i])/2; h->branch[2*i+1] = (input[i] + parity[i])/2; } }
static void
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int y;
	int n;
	int first;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	for (y = 0; y < subband_width; y++)
	{
		for (n = 0; n < subband_width; n += 8)
		{
			/* The following 3 Set operations consume more than half of the
			   total DWT processing time! */
			src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
			src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
			src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
				src[14], src[12], src[10], src[8], src[6], src[4], src[2]);

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);
			_mm_store_si128((__m128i*) h, h_n);

			h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);
			_mm_store_si128((__m128i*) l, l_n);

			src += 16;
			l += 8;
			h += 8;
		}
	}
}
static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, int source_dx) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; uint8 u, v, y; int x = 0; while (width >= 2) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; x += source_dx; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); y = y_buf[x >> 16]; x += source_dx; xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}
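// Rough scalar equivalent of sum_diff_16x1 above (illustrative only):
// unpacking a register with itself and arithmetically shifting each 16-bit
// lane right by 8 sign-extends every signed byte to 16 bits, after which the
// lanes are simply summed.
static inline int sum_diff_16x1_scalar(const int8_t acc_diff[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) sum += acc_diff[i];  // each lane already signed
  return sum;
}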
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *orig = p0, *end = p2; line_copy8(p0, srcp + stride, width, 1); line_copy8(p1, srcp, width, 1); uint8_t threshold = (uint8_t)th; __m128i zero = _mm_setzero_si128(); __m128i xth = _mm_set1_epi8((int8_t)threshold); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 1 ? 1 : -1); line_copy8(p2, srcp, width, 1); uint8_t *coordinates[] = COORDINATES; for (int x = 0; x < width; x += 16) { __m128i sumlo = zero; __m128i sumhi = zero; for (int i = 0; i < 8; i++) { __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x)); sumlo = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero)); sumhi = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero)); } sumlo = _mm_srai_epi16(sumlo, 3); sumhi = _mm_srai_epi16(sumhi, 3); sumlo = _mm_packus_epi16(sumlo, sumhi); __m128i src = _mm_load_si128((__m128i *)(p1 + x)); __m128i limit = _mm_adds_epu8(src, xth); sumlo = _mm_max_epu8(sumlo, src); sumlo = _mm_min_epu8(sumlo, limit); _mm_store_si128((__m128i *)(dstp + x), sumlo); } dstp += stride; p0 = p1; p1 = p2; p2 = (p2 == end) ? orig : p2 + bstride; } }
// Predictors13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 2 <= num_pixels; i += 2) {
    // we can only process two pixels at a time
    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
    const __m128i avg = _mm_srli_epi16(sum, 1);
    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
    const __m128i A3 = _mm_srai_epi16(A2, 1);
    const __m128i A4 = _mm_add_epi16(avg, A3);
    const __m128i pred = _mm_packus_epi16(A4, A4);
    const __m128i res = _mm_sub_epi8(src, pred);
    _mm_storel_epi64((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
  }
}
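// Per-channel scalar reference for the ClampedAddSubtractHalf predictor used
// above (a sketch of the equivalent math, not the libwebp C code verbatim):
// pred = clip255(avg + (avg - TL) / 2) with avg = (T + L) >> 1, where the
// division truncates toward zero, which is what the bit_fix term emulates.
static inline uint8_t clamped_add_subtract_half(uint8_t left, uint8_t top,
                                                uint8_t top_left) {
  const int avg = (left + top) >> 1;
  const int delta = (avg - top_left) / 2;  // C division truncates toward zero
  const int pred = avg + delta;
  return (uint8_t)(pred < 0 ? 0 : (pred > 255 ? 255 : pred));  // packus clamp
}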
void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; __m128i* zPtr = (__m128i*) z; __m128i xVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); zVal = _mm_srai_epi16(xVal, k); _mm_store_si128(zPtr, zVal); xPtr ++; zPtr ++; } number = points * 8; short divn = (1<<k); for(;number < len; number++){ z[number] = x[number] / divn; } #endif }
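// Side note (editorial, not from srsLTE): _mm_srai_epi16 rounds negative
// values toward -infinity, while the scalar tail's division by (1 << k)
// truncates toward zero, so the two paths can differ by 1 for negative odd
// inputs. A scalar form matching the SIMD path would be:
static inline short sc_div2_srai_scalar(short x, uint32_t k) {
  return (short)(x >> k);  // arithmetic shift on typical compilers
}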
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) { uint8_t *dst = (uint8_t*)_dst; ptrdiff_t stride = _stride; int shift = 5; int offset = 16; __m128i r0, r1, r2, r3, r4, r5, r6, r9; r9 = _mm_setzero_si128(); r2 = _mm_set1_epi16(offset); r0 = _mm_load_si128((__m128i*)(coeffs)); r1 = _mm_load_si128((__m128i*)(coeffs + 8)); r0 = _mm_adds_epi16(r0, r2); r1 = _mm_adds_epi16(r1, r2); r0 = _mm_srai_epi16(r0, shift); r1 = _mm_srai_epi16(r1, shift); r3 = _mm_loadl_epi64((__m128i*)(dst)); r4 = _mm_loadl_epi64((__m128i*)(dst + stride)); r5 = _mm_loadl_epi64((__m128i*)(dst + 2 * stride)); r6 = _mm_loadl_epi64((__m128i*)(dst + 3 * stride)); r3 = _mm_unpacklo_epi8(r3, r9); r4 = _mm_unpacklo_epi8(r4, r9); r5 = _mm_unpacklo_epi8(r5, r9); r6 = _mm_unpacklo_epi8(r6, r9); r3 = _mm_unpacklo_epi64(r3, r4); r4 = _mm_unpacklo_epi64(r5, r6); r3 = _mm_adds_epi16(r3, r0); r4 = _mm_adds_epi16(r4, r1); r3 = _mm_packus_epi16(r3, r4); *((uint32_t *)(dst)) = _mm_cvtsi128_si32(r3); dst+=stride; *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 4)); dst+=stride; *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 8)); dst+=stride; *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 12)); }
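// Scalar sketch of the 4x4 transform-skip path above (illustrative only, not
// the FFmpeg reference): each residual is rounded with offset 16, shifted
// right by 5 and added to the reconstructed pixel, with clipping to [0, 255].
static void transform_skip_8_scalar(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) {
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      int v = dst[y * stride + x] + ((coeffs[y * 4 + x] + 16) >> 5);
      dst[y * stride + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}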
__m128i test_mm_srai_epi16(__m128i A) {
  // DAG-LABEL: test_mm_srai_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psrai.w
  //
  // ASM-LABEL: test_mm_srai_epi16
  // ASM: psraw
  return _mm_srai_epi16(A, 1);
}
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane = output[0];
    LPBYTE uPlane = output[1];
    LPBYTE vPlane = output[2];
    int chrPitch = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0+width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];
                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
SIMDValue SIMDInt16x8Operation::OpShiftRightByScalar(const SIMDValue& value, int count)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpValue = X86SIMDValue::ToX86SIMDValue(value);

    // Shifts the 8 signed 16-bit integers right by count bits while shifting in the sign bit
    x86Result.m128i_value = _mm_srai_epi16(tmpValue.m128i_value, count);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and having the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
__m64 _m_psrawi(__m64 _M, int _Count) { __m128i lhs = {0}; lhs.m128i_i64[0] = _M.m64_i64; lhs = _mm_srai_epi16(lhs, _Count); _M.m64_i64 = lhs.m128i_i64[0]; return _M; }
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 3
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8LSetHistogramData(distribution, histo);
}
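// Scalar form of the branchless abs/binning trick used above (illustrative):
// sign is 0 for non-negative values and -1 for negative ones, so
// (out ^ sign) - sign yields |out| without a branch; the result is then scaled
// and clamped to the histogram range.
static inline int coeff_to_bin(int16_t out, int max_coeff_thresh) {
  const int sign = out >> 15;               // 0 or -1
  const int abs_out = (out ^ sign) - sign;  // |out| without a branch
  const int v = abs_out >> 3;
  return v < max_coeff_thresh ? v : max_coeff_thresh;
}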
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
                                int start_block, int end_block) {
  int histo[MAX_COEFF_THRESH + 1] = { 0 };
  int16_t out[16];
  int j, k;
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  for (j = start_block; j < end_block; ++j) {
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 2
      const __m128i v0 = _mm_srai_epi16(abs0, 2);
      const __m128i v1 = _mm_srai_epi16(abs1, 2);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Use bin to update histogram.
    for (k = 0; k < 16; ++k) {
      histo[out[k]]++;
    }
  }
  return VP8GetAlpha(histo);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { __m128i v = _mm_loadl_epi64((__m128i const *)dst); const __m128i ones = _mm_set1_epi16(1); highbdRndingPacks(u); highbd_clip(u, 1, bd); v = _mm_add_epi16(v, u[0]); v = _mm_add_epi16(v, ones); v = _mm_srai_epi16(v, 1); _mm_storel_epi64((__m128i *)dst, v); }
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { __m128i v = _mm_loadl_epi64((__m128i const *)dst); const __m128i ones = _mm_set1_epi16(1); highbdRndingPacks(u); highbd_clip(u, 1, bd); v = _mm_add_epi16(v, u[0]); v = _mm_add_epi16(v, ones); v = _mm_srai_epi16(v, 1); *(uint32_t *)dst = _mm_cvtsi128_si32(v); }
static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; while (width >= 2) { xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R, __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419 = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708 = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);  // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);  // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);  // range: [0, 34238]
}
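// Scalar model of the arithmetic above (an illustrative sketch, not libwebp
// code; it assumes, as the range comments suggest, that each 16-bit lane of
// Y0/U0/V0 carries the 8-bit sample in its high byte, so
// _mm_mulhi_epu16(x << 8, k) amounts to (x * k) >> 8):
static inline void yuv444_to_rgb_fixed(int y, int u, int v,
                                       int* r, int* g, int* b) {
  const int y1 = (y * 19077) >> 8;
  *r = (y1 + ((v * 26149) >> 8) - 14234) >> 6;
  *g = (y1 - ((u * 6419) >> 8) - ((v * 13320) >> 8) + 8708) >> 6;
  const int b2 = y1 + ((u * 33050) >> 8) - 17685;
  *b = (b2 < 0 ? 0 : b2) >> 6;  // _mm_subs_epu16 clamps this at zero for free
}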