Example no. 1
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* interleave packet-type and VLAN-tag words of the four descriptors */
	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	/* gather one 16-bit field per packet into the low 64 bits */
	ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

	/* shift both fields into ol_flags position and merge them */
	ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

	ptype1 = _mm_or_si128(ptype1, vtag1);
	vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
Example no. 2
void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}
Example no. 3
	void ConvertInt8ToFloat(__m128i in_input,
		__m128 *out_output)
	{
		__m128i input_16, input_32;

		// zero-extend first half to 16-bit integers (unpacking with
		// ZERO treats the input bytes as unsigned)
		input_16 = _mm_unpacklo_epi8(in_input, ZERO);
		// convert first fourth to 32-bit integers
		input_32 = _mm_unpacklo_epi16(input_16, ZERO);
		// convert first fourth to 32bit floating point value
		out_output[0] = _mm_cvtepi32_ps(input_32);

		// convert second fourth to 32bit integer
		input_32 = _mm_unpackhi_epi16(input_16, ZERO);
		// convert second fourth to 32bit floating point value
		out_output[1] = _mm_cvtepi32_ps(input_32);

		// convert second half to 16bit integer
		input_16 = _mm_unpackhi_epi8(in_input, ZERO);
		// convert third fourth to 32-bit integers
		input_32 = _mm_unpacklo_epi16(input_16, ZERO);
		// convert third fourth to 32bit floating point value
		out_output[2] = _mm_cvtepi32_ps(input_32);

		// convert fourth fourth to 32bit integer
		input_32 = _mm_unpackhi_epi16(input_16, ZERO);
		// convert fourth fourth to 32bit floating point value
		out_output[3] = _mm_cvtepi32_ps(input_32);
	}
Example no. 4
inline FORCE_INLINE void transpose8_epi16(__m128i &x0, __m128i &x1, __m128i &x2, __m128i &x3, __m128i &x4, __m128i &x5, __m128i &x6, __m128i &x7)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;
	__m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

	t0 = _mm_unpacklo_epi16(x0, x1);
	t1 = _mm_unpacklo_epi16(x2, x3);
	t2 = _mm_unpacklo_epi16(x4, x5);
	t3 = _mm_unpacklo_epi16(x6, x7);
	t4 = _mm_unpackhi_epi16(x0, x1);
	t5 = _mm_unpackhi_epi16(x2, x3);
	t6 = _mm_unpackhi_epi16(x4, x5);
	t7 = _mm_unpackhi_epi16(x6, x7);

	tt0 = _mm_unpacklo_epi32(t0, t1);
	tt1 = _mm_unpackhi_epi32(t0, t1);
	tt2 = _mm_unpacklo_epi32(t2, t3);
	tt3 = _mm_unpackhi_epi32(t2, t3);
	tt4 = _mm_unpacklo_epi32(t4, t5);
	tt5 = _mm_unpackhi_epi32(t4, t5);
	tt6 = _mm_unpacklo_epi32(t6, t7);
	tt7 = _mm_unpackhi_epi32(t6, t7);

	x0 = _mm_unpacklo_epi64(tt0, tt2);
	x1 = _mm_unpackhi_epi64(tt0, tt2);
	x2 = _mm_unpacklo_epi64(tt1, tt3);
	x3 = _mm_unpackhi_epi64(tt1, tt3);
	x4 = _mm_unpacklo_epi64(tt4, tt6);
	x5 = _mm_unpackhi_epi64(tt4, tt6);
	x6 = _mm_unpacklo_epi64(tt5, tt7);
	x7 = _mm_unpackhi_epi64(tt5, tt7);
}
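The 24 unpacks above form the classic three-stage 8x8 16-bit transpose: interleave 16-bit words, then 32-bit pairs, then 64-bit halves. A minimal check harness (my sketch; it assumes SSE2 and that FORCE_INLINE is defined so transpose8_epi16 above compiles):
#include <emmintrin.h>
#include <cstdio>

int main() {
	alignas(16) unsigned short m[8][8], t[8][8];
	for (int r = 0; r < 8; ++r)
		for (int c = 0; c < 8; ++c)
			m[r][c] = static_cast<unsigned short>(r * 8 + c);

	__m128i row[8];
	for (int r = 0; r < 8; ++r)
		row[r] = _mm_load_si128(reinterpret_cast<const __m128i*>(m[r]));

	transpose8_epi16(row[0], row[1], row[2], row[3],
	                 row[4], row[5], row[6], row[7]);

	for (int r = 0; r < 8; ++r)
		_mm_store_si128(reinterpret_cast<__m128i*>(t[r]), row[r]);

	// a transpose swaps row and column indices
	for (int r = 0; r < 8; ++r)
		for (int c = 0; c < 8; ++c)
			if (t[r][c] != m[c][r]) { std::puts("transpose mismatch"); return 1; }
	std::puts("transpose ok");
	return 0;
}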
Example no. 5
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
    }
    for (; i < width; ++i) dst[i] = alpha[i] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
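The scalar tail above states the full semantics: each alpha byte lands in bits 8..15 of the destination word, i.e. the green channel of a little-endian BGRA pixel. A plain-C equivalent of the whole routine (my sketch, not from the original source):
static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
                                   int width, int height,
                                   uint32_t* dst, int dst_stride) {
  int i, j;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < width; ++i) dst[i] = (uint32_t)alpha[i] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}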
Example no. 6
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
  __m128i v0, v1;
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[0] = _mm_add_epi32(u[0], rnd);
  u[1] = _mm_add_epi32(u[1], rnd);
  u[2] = _mm_add_epi32(u[2], rnd);
  u[3] = _mm_add_epi32(u[3], rnd);

  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);

  u[0] = _mm_packus_epi32(u[0], u[1]);
  u[1] = _mm_packus_epi32(u[2], u[3]);

  highbd_clip(u, 2, bd);

  v0 = _mm_unpacklo_epi16(u[0], u[1]);
  v1 = _mm_unpackhi_epi16(u[0], u[1]);

  u[0] = _mm_unpacklo_epi16(v0, v1);
  u[2] = _mm_unpackhi_epi16(v0, v1);

  u[1] = _mm_srli_si128(u[0], 8);
  u[3] = _mm_srli_si128(u[2], 8);
}
Example no. 7
static void GF_FUNC_ALIGN VS_CC
convert_to_float_8bit(int radius, float *kernel, const uint8_t *srcp,
                      float *buff, float *dstp, int width, int height,
                      int src_stride, int dst_stride)
{
    __m128i zero = _mm_setzero_si128();
    
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128i xmm0 = _mm_load_si128((__m128i *)(srcp + x));
            __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
            xmm0 = _mm_unpacklo_epi8(xmm0, zero);
            __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
            __m128 f1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
            __m128 f2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
            __m128 f3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));
            
            _mm_store_ps(dstp + x , f0);
            _mm_store_ps(dstp + x + 4, f1);
            _mm_store_ps(dstp + x + 8, f2);
            _mm_store_ps(dstp + x + 12, f3);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
Example no. 8
void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}
Example no. 9
void __stdcall
rgb32_to_planar_shader_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* buff) noexcept
{
    const uint8_t* s = srcp[0] + (height - 1) * spitch;
    // map r to Y, g to U, b to V
    uint8_t* dr = dstp[0];
    uint8_t* dg = dstp[1];
    uint8_t* db = dstp[2];

    float* bb = reinterpret_cast<float*>(buff);
    float* bg = bb + ((width + 7) & ~7); // stride rounded up to 8 floats so each plane stays 32-byte aligned
    float* br = bg + ((width + 7) & ~7);

    const __m128 rcp = _mm_set1_ps(1.0f / 255);
    const __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 8) {
            __m128i s0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 0));
            __m128i s1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 8));
            __m128i s2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 16));
            __m128i s3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 24));
            s0 = _mm_unpacklo_epi8(s0, s1);
            s1 = _mm_unpacklo_epi8(s2, s3);
            s2 = _mm_unpacklo_epi16(s0, s1);
            s3 = _mm_unpackhi_epi16(s0, s1);
            s0 = _mm_unpacklo_epi8(s2, s3);
            s1 = _mm_unpackhi_epi8(s2, s3);

            __m128i r = _mm_unpacklo_epi8(s1, zero);
            s3 = _mm_unpacklo_epi16(r, zero);
            _mm_store_ps(br + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
            s3 = _mm_unpackhi_epi16(r, zero);
            _mm_store_ps(br + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));

            __m128i b = _mm_unpacklo_epi8(s0, zero);
            __m128i g = _mm_unpackhi_epi8(s0, zero);

            s3 = _mm_unpacklo_epi16(b, zero);
            _mm_store_ps(bb + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
            s3 = _mm_unpackhi_epi16(b, zero);
            _mm_store_ps(bb + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));

            s3 = _mm_unpacklo_epi16(g, zero);
            _mm_store_ps(bg + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
            s3 = _mm_unpackhi_epi16(g, zero);
            _mm_store_ps(bg + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
        }
        convert_float_to_half(dr, br, width);
        convert_float_to_half(dg, bg, width);
        convert_float_to_half(db, bb, width);
        s -= spitch;
        dr += dpitch;
        dg += dpitch;
        db += dpitch;
    }
}
Example no. 10
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example no. 11
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example no. 12
 SIMD_INLINE void UnpackMask32i(const uint8_t * src, uint32_t * dst, const __m128i & mask)
 {
     __m128i s = _mm_and_si128(mask, _mm_loadu_si128((__m128i*)src));
     __m128i lo = _mm_unpacklo_epi8(s, _mm_setzero_si128());
     _mm_storeu_si128((__m128i*)dst + 0, _mm_unpacklo_epi16(lo, _mm_setzero_si128()));
     _mm_storeu_si128((__m128i*)dst + 1, _mm_unpackhi_epi16(lo, _mm_setzero_si128()));
     __m128i hi = _mm_unpackhi_epi8(s, _mm_setzero_si128());
     _mm_storeu_si128((__m128i*)dst + 2, _mm_unpacklo_epi16(hi, _mm_setzero_si128()));
     _mm_storeu_si128((__m128i*)dst + 3, _mm_unpackhi_epi16(hi, _mm_setzero_si128()));
 }
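The two unpack stages zero-extend 16 masked bytes into 16 32-bit lanes. A scalar equivalent (a sketch, with the vector mask reduced to a single byte applied to every element):
static void UnpackMask32i_C(const uint8_t * src, uint32_t * dst, uint8_t mask)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = (uint32_t)(src[i] & mask);
}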
Example no. 13
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* mask everything except vlan present bit */
	const __m128i vlan_msk = _mm_set_epi16(
			0x0000, 0x0000,
			0x0000, 0x0000,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
	/* map vlan present (0x8) to ol_flags */
	const __m128i vlan_map = _mm_set_epi8(
		0, 0, 0, 0,
		0, 0, 0, vlan_flags,
		0, 0, 0, 0,
		0, 0, 0, 0);

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_and_si128(vtag1, vlan_msk);
	vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
Example no. 14
 template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128i & t, const __m128i & l, const __m128i & r, const __m128i & b, Buffer & buffer, size_t col)
 {
     HogDirectionHistograms<align>(
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(r, K_ZERO), _mm_unpacklo_epi16(l, K_ZERO))), 
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(b, K_ZERO), _mm_unpacklo_epi16(t, K_ZERO))), 
         buffer, col + 0);
     HogDirectionHistograms<align>(
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(r, K_ZERO), _mm_unpackhi_epi16(l, K_ZERO))), 
         _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(b, K_ZERO), _mm_unpackhi_epi16(t, K_ZERO))), 
         buffer, col + 4);
 }
Example no. 15
        template <bool align> SIMD_INLINE void GrayToBgra(uint8_t * bgra, __m128i gray, __m128i alpha)
        {
            __m128i bgLo = _mm_unpacklo_epi8(gray, gray);
            __m128i bgHi = _mm_unpackhi_epi8(gray, gray);
            __m128i raLo = _mm_unpacklo_epi8(gray, alpha);
            __m128i raHi = _mm_unpackhi_epi8(gray, alpha);

            Store<align>((__m128i*)bgra + 0, _mm_unpacklo_epi16(bgLo, raLo));
            Store<align>((__m128i*)bgra + 1, _mm_unpackhi_epi16(bgLo, raLo));
            Store<align>((__m128i*)bgra + 2, _mm_unpacklo_epi16(bgHi, raHi));
            Store<align>((__m128i*)bgra + 3, _mm_unpackhi_epi16(bgHi, raHi));
        }
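Tracing the interleaves: _mm_unpacklo_epi8(gray, gray) yields g0g0 g1g1 ..., _mm_unpacklo_epi8(gray, alpha) yields g0a0 g1a1 ..., and the 16-bit unpacks merge them into the byte pattern g,g,g,a per pixel. A scalar equivalent (my sketch, taking one alpha byte per pixel instead of a vector):
static void GrayToBgra_C(uint8_t * bgra, const uint8_t * gray,
                         const uint8_t * alpha, size_t count)
{
    for (size_t i = 0; i < count; ++i)
    {
        bgra[4 * i + 0] = gray[i];  // B
        bgra[4 * i + 1] = gray[i];  // G
        bgra[4 * i + 2] = gray[i];  // R
        bgra[4 * i + 3] = alpha[i]; // A
    }
}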
Example no. 16
/*Transpose 8 vectors with 8 16-bit values.*/
OD_SIMD_INLINE void od_transpose16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3,  __m128i *t4, __m128i *t5,
 __m128i *t6, __m128i *t7) {
  __m128i a0;
  __m128i b0;
  __m128i c0;
  __m128i d0;
  __m128i e0;
  __m128i f0;
  __m128i g0;
  __m128i h0;
  __m128i a1;
  __m128i b1;
  __m128i c1;
  __m128i d1;
  __m128i e1;
  __m128i f1;
  __m128i g1;
  __m128i h1;
  /*00112233*/
  a0 = _mm_unpacklo_epi16(*t0, *t1);
  b0 = _mm_unpacklo_epi16(*t2, *t3);
  c0 = _mm_unpacklo_epi16(*t4, *t5);
  d0 = _mm_unpacklo_epi16(*t6, *t7);
  /*44556677*/
  e0 = _mm_unpackhi_epi16(*t0, *t1);
  f0 = _mm_unpackhi_epi16(*t2, *t3);
  g0 = _mm_unpackhi_epi16(*t4, *t5);
  h0 = _mm_unpackhi_epi16(*t6, *t7);
  /*00001111*/
  a1 = _mm_unpacklo_epi32(a0, b0);
  b1 = _mm_unpacklo_epi32(c0, d0);
  /*22223333*/
  c1 = _mm_unpackhi_epi32(a0, b0);
  d1 = _mm_unpackhi_epi32(c0, d0);
  /*44445555*/
  e1 = _mm_unpacklo_epi32(e0, f0);
  f1 = _mm_unpacklo_epi32(g0, h0);
  /*66667777*/
  g1 = _mm_unpackhi_epi32(e0, f0);
  h1 = _mm_unpackhi_epi32(g0, h0);
  *t0 = _mm_unpacklo_epi64(a1, b1);
  *t1 = _mm_unpackhi_epi64(a1, b1);
  *t2 = _mm_unpacklo_epi64(c1, d1);
  *t3 = _mm_unpackhi_epi64(c1, d1);
  *t4 = _mm_unpacklo_epi64(e1, f1);
  *t5 = _mm_unpackhi_epi64(e1, f1);
  *t6 = _mm_unpacklo_epi64(g1, h1);
  *t7 = _mm_unpackhi_epi64(g1, h1);
}
Example no. 17
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 6420);
  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
Example no. 18
static void GF_FUNC_ALIGN VS_CC
proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff,
          float *dstp, int width, int height, int src_stride, int dst_stride)
{
    int length = radius * 2 + 1;
    const uint8_t *p[17];
    for (int i = -radius; i <= radius; i++) {
        p[i + radius] = srcp + abs(i) * src_stride;
    }

    __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 sum[4];
            sum[0] = _mm_setzero_ps();
            sum[1] = _mm_setzero_ps();
            sum[2] = _mm_setzero_ps();
            sum[3] = _mm_setzero_ps();
            
            for (int i = 0; i < length; i++) {
                __m128 f[4];
                __m128i xmm0 = _mm_load_si128((__m128i *)(p[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                f[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                f[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                f[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
                f[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));
                __m128 k = _mm_set1_ps(kernel[i]);
                
                for (int j = 0; j < 4; j++) {
                    sum[j] = _mm_add_ps(sum[j], _mm_mul_ps(k, f[j]));
                }
            }
            _mm_store_ps(buff + x,      sum[0]);
            _mm_store_ps(buff + x +  4, sum[1]);
            _mm_store_ps(buff + x +  8, sum[2]);
            _mm_store_ps(buff + x + 12, sum[3]);
        }
        proc_horizontal(buff, radius, length, width, kernel, dstp);
        
        for (int i = 0; i < length - 1; i++) {
            p[i] = p[i + 1];
        }
        p[length - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride;
        dstp += dst_stride;
    }
}
Example no. 19
File: dsp.cpp Project: taqu/opus
    void conv_Short1ToFloat2(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 3; // process 8 shorts at a time
        s32 offset = num << 3;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f);

        const LSshort* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            __m128i t = _mm_loadu_si128((const __m128i*)p);
            __m128i s16_0 = _mm_unpackhi_epi16(t, t);
            __m128i s16_1 = _mm_unpacklo_epi16(t, t);

            __m128i t1 = _mm_cmpgt_epi16(izero, s16_0);
            __m128i t2 = _mm_cmpgt_epi16(izero, s16_1);

            __m128i s32_0 = _mm_unpackhi_epi16(s16_0, t1);
            __m128i s32_1 = _mm_unpacklo_epi16(s16_0, t1);
            __m128i s32_2 = _mm_unpackhi_epi16(s16_1, t2);
            __m128i s32_3 = _mm_unpacklo_epi16(s16_1, t2);

            // convert to 32-bit floating point
            __m128 f32_0 = _mm_mul_ps(_mm_cvtepi32_ps(s32_0), fcoff);
            __m128 f32_1 = _mm_mul_ps(_mm_cvtepi32_ps(s32_1), fcoff);
            __m128 f32_2 = _mm_mul_ps(_mm_cvtepi32_ps(s32_2), fcoff);
            __m128 f32_3 = _mm_mul_ps(_mm_cvtepi32_ps(s32_3), fcoff);

            _mm_storeu_ps((q+0), f32_3);
            _mm_storeu_ps((q+4), f32_2);
            _mm_storeu_ps((q+8), f32_1);
            _mm_storeu_ps((q+12), f32_0);

            p += 8;
            q += 16;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            q[j+0] = toFloat(p[i]);
            q[j+1] = toFloat(p[i]);
        }
    }
Example no. 20
inline static short
sse3_dot_prod (const uint16_t *p1,
               const uint16_t *p2,
               size_t          size)
{
  uint32_t res[4]; /* _mm_storeu_si128 below stores four 32-bit lanes */
  unsigned int  i;

  __m128i* mp1  = (__m128i *)p1;
  __m128i* mp2  = (__m128i *)p2;
  __m128i  mres = _mm_set_epi32 (0, 0, 0, 0);

  for (i = 0; i < size; i += 8)
    {

      __m128i mreg1 = _mm_loadu_si128 (mp1);
      __m128i mreg2 = _mm_loadu_si128 (mp2);
      __m128i xlo1  = _mm_unpacklo_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xlo2  = _mm_unpacklo_epi16 (mreg2, _mm_set1_epi16 (0));
      __m128i mtmp  = _mm_mullo_epi32 (xlo1, xlo2); /* SSE4.1, despite the "sse3" name */
      mres          = _mm_add_epi32 (mres, mtmp);

      __m128i xhi1 = _mm_unpackhi_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xhi2 = _mm_unpackhi_epi16 (mreg2, _mm_set1_epi16 (0));
      mtmp         = _mm_mullo_epi32 (xhi1, xhi2);
      mres         = _mm_add_epi32 (mres, mtmp);

      mp1++;
      mp2++;
    }

  __m128i* pmres = (__m128i *)res;
  _mm_storeu_si128 (pmres, mres);

  return res[0]+res[1]+res[2]+res[3];
}
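A scalar reference for the same computation (a sketch; unlike the vector loop it does not require size to be a multiple of 8):
static unsigned int
dot_prod_c (const uint16_t *p1, const uint16_t *p2, size_t size)
{
  unsigned int acc = 0;
  size_t i;

  for (i = 0; i < size; i++)
    acc += (unsigned int) p1[i] * p2[i];

  return acc;
}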
Example no. 21
template <bool STACK16> // STACK16: a second plane of low bytes follows the main plane (see lsb below)
static inline void
yuv_to_planar_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    float* buff = reinterpret_cast<float*>(_buff);

    for (int p = 0; p < 3; ++p) {
        const uint8_t* s = srcp[p];
        const uint8_t* lsb = s + height * spitch;
        uint8_t* d = dstp[p];

        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; x += 16) {
                __m128i msbx = _mm_load_si128(reinterpret_cast<const __m128i*>(s + x));
                __m128i d0, lsbx;
                if (!STACK16) {
                    d0 = _mm_unpacklo_epi8(msbx, zero);
                } else {
                    lsbx = _mm_load_si128(reinterpret_cast<const __m128i*>(lsb + x));
                    d0 = _mm_unpacklo_epi8(lsbx, msbx);
                }
                __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
                _mm_store_ps(buff + x + 0, _mm_mul_ps(rcp, f0));
                f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
                _mm_store_ps(buff + x + 4, _mm_mul_ps(rcp, f0));

                if (!STACK16) {
                    d0 = _mm_unpackhi_epi8(msbx, zero);
                } else {
                    d0 = _mm_unpackhi_epi8(lsbx, msbx);
                }
                f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
                _mm_store_ps(buff + x + 8, _mm_mul_ps(rcp, f0));
                f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
                _mm_store_ps(buff + x + 12, _mm_mul_ps(rcp, f0));
            }
            convert_float_to_half(d, buff, width);
            s += spitch;
            d += dpitch;
            if (STACK16) {
                lsb += spitch;
            }
        }
    }
}
Example no. 22
File: dsp.cpp Project: taqu/opus
    //----------------------------------------------------------------------------
    void conv_Short1ToFloat1(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 3; // process 8 shorts at a time
        s32 offset = num << 3;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        const __m128 fcoff = _mm_set1_ps(1.0f/32767.0f);

        const LSshort* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            // convert to 32-bit floats r0 and r1
            __m128i t0 = _mm_loadu_si128((const __m128i*)p);
            __m128i t1 = _mm_cmpgt_epi16(izero, t0);
            __m128 r0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(t0, t1));
            __m128 r1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(t0, t1));

            r0 = _mm_mul_ps(r0, fcoff);
            r1 = _mm_mul_ps(r1, fcoff);
            _mm_storeu_ps((q+0), r1);
            _mm_storeu_ps((q+4), r0);
            p += 8;
            q += 8;
        }

        for(s32 i=0; i<rem; ++i){
            q[i] = toFloat(p[i]);
        }
    }
Example no. 23
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                                       const int istride,
                                                                       char *odata, /* stored through below, so not const */
                                                                       const int ostride,
                                                                       const int iwidth,
                                                                       const int iheight,
                                                                       const int ooffset_x,
                                                                       const int ooffset_y,
                                                                       const int owidth,
                                                                       const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                    13,12,   9,8, 5,4, 1,0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);

      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);

      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);

      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
Example no. 24
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
Example no. 25
static long
conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      long           n  = (samples / 2) * 2;
      const __m128i *s  = (const __m128i*) src;
            __v4sf  *d  = (__v4sf*) dst;

      for (; i < n / 2; i++)
        {
          /* Expand shorts to ints by loading zero in the high bits */
          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          /* Convert to float */
          const __m128  u0 = _mm_cvtepi32_ps (t0);
          const __m128  u1 = _mm_cvtepi32_ps (t1);

          const __v4sf rgba0 = u0 * u16_float;
          const __v4sf rgba1 = u1 * u16_float;

          d[2 * i + 0] = rgba0;
          d[2 * i + 1] = rgba1;
        }
      _mm_empty();
    }

  for (i *= 2 * 4; i != 4 * samples; i++)
    dst[i] = src[i] * (1.f / 65535);

  return samples;
}
Example no. 26
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (64 bytes) into 4 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
  }
}
Example no. 27
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}
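Since the widen-and-accumulate stage only sums absolute values, the function computes sum(|coeff[i]|). A scalar reference (a sketch; tran_low_t is the coefficient type that load_tran_low above narrows to 16 bits):
static int satd_c(const tran_low_t *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i) {
    const int v = (int)coeff[i];
    sum += v < 0 ? -v : v;
  }
  return sum;
}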
Example no. 28
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
Example no. 29
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32768.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
	
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
	
		__m128 tmp0, tmp1;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		_mm_storeu_ps(output[0]+i/2, tmp0);
		_mm_storeu_ps(output[1]+i/2, tmp1);
	}
#else
	const int sse_size = 0;
#endif
	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		output[0][i/2] = ((float) input[i]) / 32768.0f;   // match the 1/32768 scale used in the SIMD path
		output[1][i/2] = ((float) input[i+1]) / 32768.0f;
	}
}
Example no. 30
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32767.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		_mm_storeu_ps(output+i+0, samples0);
		_mm_storeu_ps(output+i+4, samples1);
	}
#else
	const int sse_size = 0;
#endif

	// unpack remaining
	for (int i = sse_size; i < size; i++)
	{
		output[i] = ((float) input[i]) / 32767.0f;
	}
}