template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                                       const int istride,
                                                                       char *odata,
                                                                       const int ostride,
                                                                       const int iwidth,
                                                                       const int iheight,
                                                                       const int ooffset_x,
                                                                       const int ooffset_y,
                                                                       const int owidth,
                                                                       const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                    13,12,   9,8, 5,4, 1,0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);

      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);

      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);

      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
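For reference, a scalar sketch of the per-pair arithmetic the loop above vectorizes; this is illustrative only (the helper name is made up), but it follows the same update/predict, rounding, offset and clip steps:

/* Scalar sketch of one even/odd coefficient pair (illustrative only). */
static inline void haar_inv_pair_scalar(int16_t E, int16_t O, int shift,
                                        int active_bits, int16_t out[2]) {
  int x0 = E - ((O + 1) >> 1);             /* inverse update step  */
  int x1 = O + x0;                         /* inverse predict step */
  if (shift != 0) {                        /* rounded down-shift   */
    x0 = (x0 + 1) >> shift;
    x1 = (x1 + 1) >> shift;
  }
  const int offset = 1 << (active_bits - 1);
  const int clip = (1 << active_bits) - 1;
  x0 += offset;                            /* re-add the DC offset */
  x1 += offset;
  out[0] = (int16_t)(x0 < 0 ? 0 : (x0 > clip ? clip : x0));
  out[1] = (int16_t)(x1 < 0 ? 0 : (x1 > clip ? clip : x1));
}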
    SIMDValue SIMDInt8x16Operation::OpShiftRightByScalar(const SIMDValue& value, int8 count)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(value);
        X86SIMDValue x86tmp1;

        const _x86_SIMDValue X86_LOWBYTE_MASK  = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
        const _x86_SIMDValue X86_HIGHBYTE_MASK = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 };

        if (count < 0 || count > 8)
        {
            count = 8;
        }

        x86tmp1.m128i_value = _mm_slli_epi16(tmpaValue.m128i_value, 8);
        x86tmp1.m128i_value = _mm_srai_epi16(x86tmp1.m128i_value, count + 8);

        x86tmp1.m128i_value = _mm_and_si128(x86tmp1.m128i_value, X86_LOWBYTE_MASK.m128i_value);

        tmpaValue.m128i_value = _mm_srai_epi16(tmpaValue.m128i_value, count);
        tmpaValue.m128i_value = _mm_and_si128(tmpaValue.m128i_value, X86_HIGHBYTE_MASK.m128i_value);

        x86Result.m128i_value = _mm_or_si128(tmpaValue.m128i_value, x86tmp1.m128i_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
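SSE2 has no arithmetic right shift for 8-bit lanes, so the routine above emulates it with two 16-bit shifts: the low bytes are first shifted left by 8 so their own sign bit becomes the 16-bit sign bit, the high bytes are shifted in place, and the two masked halves are recombined. A scalar sketch of the per-byte result (illustrative only):

/* Per-byte equivalent of the emulation above (illustrative only). */
static inline int8_t sra_epi8_scalar(int8_t v, int count) {
  if (count < 0 || count > 8) count = 8;   /* same clamping as the SIMD path */
  return (int8_t)(v >> count);             /* count == 8 yields 0 or -1      */
}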
void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
                          int src_stride, uint16_t *dst, int dst_stride,
                          int bd) {
  __m128i u[4], v[4];
  const __m128i ones = _mm_set1_epi16(1);

  transClipPixel(src, src_stride, u, bd);

  v[0] = _mm_loadl_epi64((__m128i const *)dst);
  v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
  v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
  v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));

  u[0] = _mm_add_epi16(u[0], v[0]);
  u[1] = _mm_add_epi16(u[1], v[1]);
  u[2] = _mm_add_epi16(u[2], v[2]);
  u[3] = _mm_add_epi16(u[3], v[3]);

  u[0] = _mm_add_epi16(u[0], ones);
  u[1] = _mm_add_epi16(u[1], ones);
  u[2] = _mm_add_epi16(u[2], ones);
  u[3] = _mm_add_epi16(u[3], ones);

  u[0] = _mm_srai_epi16(u[0], 1);
  u[1] = _mm_srai_epi16(u[1], 1);
  u[2] = _mm_srai_epi16(u[2], 1);
  u[3] = _mm_srai_epi16(u[3], 1);

  writePixel(u, width, pixelsNum, dst, dst_stride);
}
Example #4
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i signs = _mm_cmpgt_epi8(zero, *x);
    const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
    const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
    const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
    const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
    *x = _mm_packs_epi16(lo_1, hi_1);
}
Example #5
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}
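The 16x16 transform is assembled from four 8x8 Hadamard transforms followed by a cross-quadrant butterfly; the >> 1 halves the intermediate sums so they stay within int16_t. A scalar sketch of that second stage for a single coefficient position (illustrative only):

/* Cross-quadrant butterfly for one coefficient position (illustrative only).
 * q0..q3 point at the corresponding coefficient in each 8x8 quadrant. */
static inline void hadamard16_stage2_scalar(int16_t *q0, int16_t *q1,
                                            int16_t *q2, int16_t *q3) {
  const int16_t b0 = (int16_t)((*q0 + *q1) >> 1);
  const int16_t b1 = (int16_t)((*q0 - *q1) >> 1);
  const int16_t b2 = (int16_t)((*q2 + *q3) >> 1);
  const int16_t b3 = (int16_t)((*q2 - *q3) >> 1);
  *q0 = (int16_t)(b0 + b2);
  *q1 = (int16_t)(b1 + b3);
  *q2 = (int16_t)(b0 - b2);
  *q3 = (int16_t)(b1 - b3);
}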
Example #6
static void rfx_dwt_2d_encode_block_vert_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int total_width;
	int x;
	int n;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	total_width = subband_width << 1;

	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x += 8)
		{
			src_2n = _mm_load_si128((__m128i*) src);
			src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
			if (n < subband_width - 1)
				src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
			else
				src_2n_2 = src_2n;

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */

			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);

			_mm_store_si128((__m128i*) h, h_n);

			if (n == 0)
				h_n_m = h_n;
			else
				h_n_m = _mm_load_si128((__m128i*) (h - total_width));

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */

			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);

			_mm_store_si128((__m128i*) l, l_n);

			src += 8;
			l += 8;
			h += 8;
		}
		src += total_width;
	}
}
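The two lifting steps in the loop body follow the scalar definition quoted in the comments; a per-column sketch with the same edge handling (illustrative only, s holds one column of 2*sw samples):

/* Scalar sketch of the vertical lifting steps (illustrative only). */
static void dwt_encode_column_scalar(const int16_t *s, int16_t *l, int16_t *h,
                                     int sw) {
  for (int n = 0; n < sw; n++) {
    const int16_t s2n  = s[2 * n];
    const int16_t s2n1 = s[2 * n + 1];
    const int16_t s2n2 = (n < sw - 1) ? s[2 * n + 2] : s2n;  /* mirror the edge */
    h[n] = (int16_t)((s2n1 - ((s2n + s2n2) >> 1)) >> 1);     /* high band */
    const int16_t hm = (n == 0) ? h[0] : h[n - 1];           /* h[-1] := h[0] */
    l[n] = (int16_t)(s2n + ((hm + h[n]) >> 1));              /* low band */
  }
}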
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth bytes)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth bytes)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}
Example #8
// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
                                      const __m128i* const a0_lo,
                                      const __m128i* const a0_hi) {
    const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
    const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
    const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    *pi = _mm_adds_epi8(*pi, delta);
    *qi = _mm_subs_epi8(*qi, delta);
    FLIP_SIGN_BIT2(*pi, *qi);
}
Example #9
/* Compute branch metrics (gamma) */
void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) 
{
  __m128i res10, res20, res11, res21, res1, res2; 
  __m128i in, ap, pa, g1, g0;

  __m128i *inPtr  = (__m128i*) input;
  __m128i *appPtr = (__m128i*) app;
  __m128i *paPtr  = (__m128i*) parity;
  __m128i *resPtr = (__m128i*) h->branch;
  
  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
  
  for (int i=0;i<long_cb/8;i++) {
    in = _mm_load_si128(inPtr);
    inPtr++;
    pa = _mm_load_si128(paPtr);
    paPtr++;
    
    if (appPtr) {
      ap = _mm_load_si128(appPtr);
      appPtr++;
      in = _mm_add_epi16(ap, in);
    }
    
    g1 = _mm_add_epi16(in, pa);
    g0 = _mm_sub_epi16(in, pa);

    g1 = _mm_srai_epi16(g1, 1);
    g0 = _mm_srai_epi16(g0, 1);
    
    res10 = _mm_shuffle_epi8(g0, res10_mask);
    res20 = _mm_shuffle_epi8(g0, res20_mask);
    res11 = _mm_shuffle_epi8(g1, res11_mask);
    res21 = _mm_shuffle_epi8(g1, res21_mask);

    res1  = _mm_or_si128(res10, res11);
    res2  = _mm_or_si128(res20, res21);

    _mm_store_si128(resPtr, res1);
    resPtr++;
    _mm_store_si128(resPtr, res2);    
    resPtr++;
  }

  for (int i=long_cb;i<long_cb+3;i++) {
    h->branch[2*i]   = (input[i] - parity[i])/2;
    h->branch[2*i+1] = (input[i] + parity[i])/2;
  }
}
Example #10
static void rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int y;
	int n;
	int first;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	for (y = 0; y < subband_width; y++)
	{
		for (n = 0; n < subband_width; n += 8)
		{
			/* The following 3 Set operations consume more than half of the total DWT processing time! */
			src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
			src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
			src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
				src[14], src[12], src[10], src[8], src[6], src[4], src[2]);

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */

			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);

			_mm_store_si128((__m128i*) h, h_n);

			h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */

			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);

			_mm_store_si128((__m128i*) l, l_n);

			src += 16;
			l += 8;
			h += 8;
		}
	}
}
Example #11
    static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
                                        const uint8* u_buf,
                                        const uint8* v_buf,
                                        uint8* rgb_buf,
                                        int width,
                                        int source_dx) {
        __m128i xmm0, xmmY1, xmmY2;
        __m128  xmmY;
        uint8 u, v, y;
        int x = 0;

        while (width >= 2) {
            u = u_buf[x >> 17];
            v = v_buf[x >> 17];
            y = y_buf[x >> 16];
            x += source_dx;

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

            y = y_buf[x >> 16];
            x += source_dx;

            xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

            xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                                  0x44);
            xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

            _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
            rgb_buf += 8;
            width -= 2;
        }

        if (width) {
            u = u_buf[x >> 17];
            v = v_buf[x >> 17];
            y = y_buf[x >> 16];

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
            xmmY1 = _mm_srai_epi16(xmmY1, 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
            *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
        }
    }
Example #12
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}
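Unpacking acc_diff with itself and shifting right by 8 sign-extends each signed byte to 16 bits; _mm_madd_epi16 against a vector of ones then folds adjacent pairs into 32-bit sums, and the two _mm_srli_si128 steps reduce those to a single total. In scalar terms the function returns (illustrative only):

/* Scalar equivalent of sum_diff_16x1 (illustrative only): sum of 16 signed bytes. */
static inline int sum_diff_16x1_scalar(const int8_t diff[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) sum += diff[i];
  return sum;
}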
Example #13
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;

    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    uint8_t threshold = (uint8_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = COORDINATES;
        for (int x = 0; x < width; x += 16) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo  = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero));
                sumhi  = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero));
            }

            sumlo = _mm_srai_epi16(sumlo, 3);
            sumhi = _mm_srai_epi16(sumhi, 3);
            sumlo = _mm_packus_epi16(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu8(src, xth);

            sumlo = _mm_max_epu8(sumlo, src);
            sumlo = _mm_min_epu8(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Example #14
// Predictors13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 2 <= num_pixels; i += 2) {
    // we can only process two pixels at a time
    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
    const __m128i avg = _mm_srli_epi16(sum, 1);
    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
    const __m128i A3 = _mm_srai_epi16(A2, 1);
    const __m128i A4 = _mm_add_epi16(avg, A3);
    const __m128i pred = _mm_packus_epi16(A4, A4);
    const __m128i res = _mm_sub_epi8(src, pred);
    _mm_storel_epi64((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
  }
}
Example #15
void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

  const __m128i* xPtr = (const __m128i*) x;
  __m128i* zPtr = (__m128i*) z;

  __m128i xVal, zVal;
  for(;number < points; number++){

    xVal = _mm_load_si128(xPtr);
    
    zVal = _mm_srai_epi16(xVal, k);                 
      
    _mm_store_si128(zPtr, zVal); 

    xPtr ++;
    zPtr ++;
  }

  number = points * 8;
  short divn = (1<<k);
  for(;number < len; number++){
    z[number] = x[number] / divn;
  }
#endif
}
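One subtlety worth noting: the vectorized body uses an arithmetic shift, which rounds toward negative infinity, while the scalar tail divides, which rounds toward zero, so negative odd samples can differ by one between the two paths. A minimal illustration (assumes the usual two's-complement, arithmetic-shift behaviour; not part of the library):

#include <assert.h>
static void div2_rounding_note(void) {
  const short x = -3;
  assert((x >> 1) == -2);   /* shift:    floor(-1.5) = -2 */
  assert((x / 2)  == -1);   /* division: trunc(-1.5) = -1 */
}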
Example #16
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
{
    uint8_t *dst = (uint8_t*)_dst;
    ptrdiff_t stride = _stride;
    int shift = 5;
    int offset = 16;
    __m128i r0, r1, r2, r3, r4, r5, r6, r9;

    r9 = _mm_setzero_si128();
    r2 = _mm_set1_epi16(offset);

    r0 = _mm_load_si128((__m128i*)(coeffs));
    r1 = _mm_load_si128((__m128i*)(coeffs + 8));


    r0 = _mm_adds_epi16(r0, r2);
    r1 = _mm_adds_epi16(r1, r2);

    r0 = _mm_srai_epi16(r0, shift);
    r1 = _mm_srai_epi16(r1, shift);

    r3 = _mm_loadl_epi64((__m128i*)(dst));
    r4 = _mm_loadl_epi64((__m128i*)(dst + stride));
    r5 = _mm_loadl_epi64((__m128i*)(dst + 2 * stride));
    r6 = _mm_loadl_epi64((__m128i*)(dst + 3 * stride));

    r3 = _mm_unpacklo_epi8(r3, r9);
    r4 = _mm_unpacklo_epi8(r4, r9);
    r5 = _mm_unpacklo_epi8(r5, r9);
    r6 = _mm_unpacklo_epi8(r6, r9);
    r3 = _mm_unpacklo_epi64(r3, r4);
    r4 = _mm_unpacklo_epi64(r5, r6);


    r3 = _mm_adds_epi16(r3, r0);
    r4 = _mm_adds_epi16(r4, r1);

    r3 = _mm_packus_epi16(r3, r4);

    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(r3);
    dst+=stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 4));
    dst+=stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 8));
    dst+=stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 12));
}
Example #17
__m128i test_mm_srai_epi16(__m128i A) {
  // DAG-LABEL: test_mm_srai_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psrai.w
  //
  // ASM-LABEL: test_mm_srai_epi16
  // ASM: psraw
  return _mm_srai_epi16(A, 1);
}
Example #18
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}
Example #19
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uPlane       = output[1];
    LPBYTE vPlane       = output[2];
    int  chrPitch       = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos    = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos  = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0+width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];

                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                         // Kernel
  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
    SIMDValue SIMDInt16x8Operation::OpShiftRightByScalar(const SIMDValue& value, int count)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpValue = X86SIMDValue::ToX86SIMDValue(value);
        // Shifts the 8 signed 16-bit integers right by count bits while shifting in the sign bit
        x86Result.m128i_value = _mm_srai_epi16(tmpValue.m128i_value, count);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load one row of the source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
Example #23
__m64 _m_psrawi(__m64 _M, int _Count)
{
    __m128i lhs = {0};
    lhs.m128i_i64[0] = _M.m64_i64;

    lhs = _mm_srai_epi16(lhs, _Count);

    _M.m64_i64 = lhs.m128i_i64[0];
    return _M;
}
Example #24
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 3
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8LSetHistogramData(distribution, histo);
}
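The sign/xor/sub sequence in the middle of the block is the standard branchless absolute value for 16-bit lanes: sign is 0x0000 or 0xffff, and (out ^ sign) - sign leaves positive values unchanged and negates negative ones. A one-lane scalar sketch (illustrative only; INT16_MIN wraps to itself, just as with the packed subtract):

/* Branchless abs for one 16-bit lane (illustrative only). */
static inline int16_t abs16_branchless(int16_t v) {
  const int16_t sign = (int16_t)(v >> 15);    /* 0 or -1 (0xffff) */
  return (int16_t)((v ^ sign) - sign);
}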
Example #25
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
                                int start_block, int end_block) {
  int histo[MAX_COEFF_THRESH + 1] = { 0 };
  int16_t out[16];
  int j, k;
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  for (j = start_block; j < end_block; ++j) {
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 2
      const __m128i v0 = _mm_srai_epi16(abs0, 2);
      const __m128i v1 = _mm_srai_epi16(abs1, 2);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Use bin to update histogram.
    for (k = 0; k < 16; ++k) {
      histo[out[k]]++;
    }
  }

  return VP8GetAlpha(histo);
}
Example #26
static void TransformAC3(const int16_t* in, uint8_t* dst) {
    static const int kC1 = 20091 + (1 << 16);
    static const int kC2 = 35468;
    const __m128i A = _mm_set1_epi16(in[0] + 4);
    const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
    const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
    const int c1 = MUL(in[1], kC2);
    const int d1 = MUL(in[1], kC1);
    const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
    const __m128i B = _mm_adds_epi16(A, CD);
    const __m128i m0 = _mm_adds_epi16(B, d4);
    const __m128i m1 = _mm_adds_epi16(B, c4);
    const __m128i m2 = _mm_subs_epi16(B, c4);
    const __m128i m3 = _mm_subs_epi16(B, d4);
    const __m128i zero = _mm_setzero_si128();
    // Load the source pixels.
    __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
    __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
    __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
    __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform.
    dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
    dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
    dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
    dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
    *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
    *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
    *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  _mm_storel_epi64((__m128i *)dst, v);
}
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
Example #29
    static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
            const uint8* u_buf,
            const uint8* v_buf,
            uint8* rgb_buf,
            int width) {
        __m128i xmm0, xmmY1, xmmY2;
        __m128  xmmY;

        while (width >= 2) {
            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));

            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

            xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
            xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

            xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                                  0x44);
            xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

            _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
            rgb_buf += 8;
            width -= 2;
        }

        if (width) {
            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
            xmmY1 = _mm_srai_epi16(xmmY1, 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
            *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
        }
    }
Example #30
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}