Example No. 1
static inline __m128i local_abs_epi32(__m128i val)
{
	__m128i mask = _mm_srai_epi32(val, 31);
	val = _mm_xor_si128(val, mask);
	val = _mm_sub_epi32(val, mask);
	return val;
}
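For comparison, here is a scalar sketch of the sign-mask trick each lane performs above (my own illustration, not taken from the original project; it assumes 32-bit two's-complement integers and an arithmetic right shift on negative signed values):

#include <stdint.h>

static inline int32_t local_abs_i32(int32_t val)
{
	int32_t mask = val >> 31;      /* 0 for non-negative values, -1 (all bits set) for negative ones */
	return (val ^ mask) - mask;    /* no-op when mask == 0, two's-complement negation otherwise */
}

The SSE2 version does the same thing four lanes at a time: _mm_srai_epi32(val, 31) builds the per-lane sign mask, and the xor/subtract pair negates only the negative lanes.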
Example No. 2
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32767.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		_mm_storeu_ps(output+i+0, samples0);
		_mm_storeu_ps(output+i+4, samples1);
	}
#else
	const int sse_size = 0;
#endif

	// unpack remaining
	for (int i = sse_size; i < size; i++)
	{
		output[i] = ((float) input[i]) / 32767.0f;
	}
}
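The _mm_unpacklo_epi16(zero, isamples) / _mm_srai_epi32(..., 16) pair used above is a common SSE2 idiom for sign-extending 16-bit samples to 32 bits: interleaving with zero places each sample in the upper half of a 32-bit lane, and the arithmetic shift moves it back down while replicating its sign bit. A scalar sketch of one lane (my own illustration, assuming the usual two's-complement wrap-around and arithmetic right shift):

#include <stdint.h>

static inline int32_t sign_extend_16(uint16_t raw)
{
	int32_t widened = (int32_t)((uint32_t)raw << 16);  /* the sample now occupies the upper 16 bits of the lane */
	return widened >> 16;                              /* the arithmetic shift restores the value with its sign */
}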
Example No. 3
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/8)*8;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/32768.0f);
	for (int i = 0; i < sse_size; i+=8)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
	
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
	
		__m128 tmp0, tmp1;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		_mm_storeu_ps(output[0]+i/2, tmp0);
		_mm_storeu_ps(output[1]+i/2, tmp1);
	}
#else
	const int sse_size = 0;
#endif
	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		output[0][i/2] = ((float) input[i]) / 32767.0f;
		output[1][i/2] = ((float) input[i+1]) / 32767.0f;
	}
}
Example No. 4
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
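Read in scalar terms, this kernel leaves twice the sum of the 4x4 block in the first output lane; a rough equivalent (my own illustration; store_output and tran_low_t are library-internal, so a plain int32_t output is assumed here):

#include <stdint.h>

static void fdct4x4_1_scalar(const int16_t *input, int32_t *output, int stride)
{
  int32_t sum = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      sum += input[r * stride + c];   /* accumulate all 16 samples */
  output[0] = sum * 2;                /* the SSE2 code leaves 2 * sum in lane 0 of the stored vector */
}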
Example No. 5
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
Example No. 6
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);
    const int a1 = (in[1 * 16] + in[3 * 16]);
    const int a2 = (in[1 * 16] - in[3 * 16]);
    const int a3 = (in[0 * 16] - in[2 * 16]);
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  {
    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
    const __m128i a0 = _mm_add_epi32(src0, src2);
    const __m128i a1 = _mm_add_epi32(src1, src3);
    const __m128i a2 = _mm_sub_epi32(src1, src3);
    const __m128i a3 = _mm_sub_epi32(src0, src2);
    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
    const __m128i out0 = _mm_packs_epi32(b0, b1);
    const __m128i out1 = _mm_packs_epi32(b2, b3);
    _mm_storeu_si128((__m128i*)&out[0], out0);
    _mm_storeu_si128((__m128i*)&out[8], out1);
  }
}
Example No. 7
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
  __m128i v0, v1;
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[0] = _mm_add_epi32(u[0], rnd);
  u[1] = _mm_add_epi32(u[1], rnd);
  u[2] = _mm_add_epi32(u[2], rnd);
  u[3] = _mm_add_epi32(u[3], rnd);

  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);

  u[0] = _mm_packus_epi32(u[0], u[1]);
  u[1] = _mm_packus_epi32(u[2], u[3]);

  highbd_clip(u, 2, bd);

  v0 = _mm_unpacklo_epi16(u[0], u[1]);
  v1 = _mm_unpackhi_epi16(u[0], u[1]);

  u[0] = _mm_unpacklo_epi16(v0, v1);
  u[2] = _mm_unpackhi_epi16(v0, v1);

  u[1] = _mm_srli_si128(u[0], 8);
  u[3] = _mm_srli_si128(u[2], 8);
}
Example No. 8
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
Example No. 9
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example No. 10
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example No. 11
template<> void
cvtScale_<short, int, float>( const short* src, size_t sstep,
           int* dst, size_t dstep, Size size,
           float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;

         #if CV_SSE2
            if(USE_SSE2)//~5X
            {
                __m128 scale128 = _mm_set1_ps (scale);
                __m128 shift128 = _mm_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                    __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                    r0 = _mm_cvtps_epi32(rf0);
                    r1 = _mm_cvtps_epi32(rf1);

                    _mm_storeu_si128((__m128i*)(dst + x), r0);
                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
                }
            }
        #endif

        //We will wait Haswell
        /*
        #if CV_AVX
            if(USE_AVX)//2X - bad variant
            {
                ////TODO:AVX implementation (optimization?) required
                __m256 scale256 = _mm256_set1_ps (scale);
                __m256 shift256 = _mm256_set1_ps (shift);
                for(; x <= size.width - 8; x += 8 )
                {
                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
                    __m256i res = _mm256_cvtps_epi32(r0);
                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
                }
            }
        #endif*/

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<int>(src[x]*scale + shift);
    }
}
Example No. 12
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
																			   const int istride,
																			   const char *odata,
																			   const int ostride,
																			   const int iwidth,
																			   const int iheight,
																			   const int ooffset_x,
																			   const int ooffset_y,
																			   const int owidth,
																			   const int oheight) {
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);

      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);

      R = _mm_srli_epi16(R, (16 - active_bits));
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}
Example No. 13
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, int th)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;

    line_copy16(p0, srcp + stride, width, 1);
    line_copy16(p1, srcp, width, 1);

    int16_t threshold = (int16_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth  = _mm_set1_epi16(threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);
        uint16_t *coordinates[] = COORDINATES;
        for (int x = 0; x < width; x += 8) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo = _mm_add_epi32(sumlo, _mm_unpacklo_epi16(target, zero));
                sumhi = _mm_add_epi32(sumhi, _mm_unpackhi_epi16(target, zero));
            }

            sumlo = _mm_srai_epi32(sumlo, 3);
            sumhi = _mm_srai_epi32(sumhi, 3);
            sumlo = mm_cast_epi32(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu16(src, xth);

            sumlo = MM_MAX_EPU16(sumlo, src);
            sumlo = MM_MIN_EPU16(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Example No. 14
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
Example No. 15
mlib_status
__mlib_VectorConvert_S16_S32_Mod(
	mlib_s16 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3;
	mlib_s32 *px = (mlib_s32 *)x;
	mlib_s16 *pz = (mlib_s16 *)z;
	__m128i zbuf, xlo, xhi, mask;

	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;

	nstep = 16 / sizeof (mlib_s16);
	n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			*pz++ = *px++;
		}
	} else {
		for (i = 0; i < n1; i++) {
			*pz++ = *px++;
		}

		for (i = 0; i < n2; i++) {
			xlo = _mm_load_si128((__m128i *)px);
			xhi = _mm_load_si128((__m128i *)px + 1);
			xlo = _mm_slli_epi32(xlo, 16);
			xhi = _mm_slli_epi32(xhi, 16);
			xlo = _mm_srai_epi32(xlo, 16);
			xhi = _mm_srai_epi32(xhi, 16);
			zbuf = _mm_packs_epi32(xlo, xhi);
			_mm_storeu_si128((__m128i *)pz, zbuf);
			px += nstep;
			pz += nstep;
		}

		for (i = 0; i < n3; i++) {
			*pz++ = *px++;
		}
	}

	return (MLIB_SUCCESS);
}
Example No. 16
inline FORCE_INLINE __m128i export_i30_u16(__m128i lo, __m128i hi, uint16_t limit)
{
	const __m128i round = _mm_set1_epi32(1 << 13);

	lo = _mm_add_epi32(lo, round);
	hi = _mm_add_epi32(hi, round);

	lo = _mm_srai_epi32(lo, 14);
	hi = _mm_srai_epi32(hi, 14);

	lo = _mm_packs_epi32(lo, hi);

	return lo;
}
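As a reading aid, a scalar sketch of the same round-and-narrow step on a single lane (my own illustration; the clamp mirrors the signed saturation performed by _mm_packs_epi32, and the unused limit parameter is ignored, as in the excerpt above):

#include <stdint.h>

static int16_t export_i30_u16_scalar(int32_t x)
{
	x = (x + (1 << 13)) >> 14;          /* add half an LSB, then drop the 14 fraction bits */
	if (x > INT16_MAX) x = INT16_MAX;   /* _mm_packs_epi32 saturates to the int16 range */
	if (x < INT16_MIN) x = INT16_MIN;
	return (int16_t)x;
}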
Example No. 17
    // Unary Ops
    SIMDValue SIMDInt32x4Operation::OpAbs(const SIMDValue& value)
    {
        SIMDValue result;

        X86SIMDValue x86Result;
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
        if (AutoSystemInfo::Data.SSE3Available())
        {
            x86Result.m128i_value = _mm_abs_epi32(v.m128i_value); // only available after SSE3
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else if (AutoSystemInfo::Data.SSE2Available())
        {
            X86SIMDValue  temp, SIGNMASK;
            SIGNMASK.m128i_value = _mm_srai_epi32(v.m128i_value, 31);                // mask = value >> 31
            temp.m128i_value = _mm_xor_si128(v.m128i_value, SIGNMASK.m128i_value);   // temp = value ^ mask
            x86Result.m128i_value = _mm_sub_epi32(temp.m128i_value, SIGNMASK.m128i_value);  // temp - mask
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else
        {
            result.i32[SIMD_X] = (value.i32[SIMD_X] < 0) ? -1 * value.i32[SIMD_X] : value.i32[SIMD_X];
            result.i32[SIMD_Y] = (value.i32[SIMD_Y] < 0) ? -1 * value.i32[SIMD_Y] : value.i32[SIMD_Y];
            result.i32[SIMD_Z] = (value.i32[SIMD_Z] < 0) ? -1 * value.i32[SIMD_Z] : value.i32[SIMD_Z];
            result.i32[SIMD_W] = (value.i32[SIMD_W] < 0) ? -1 * value.i32[SIMD_W] : value.i32[SIMD_W];
        }

        return result;
    }
Example No. 18
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
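The shuffle/add pairs at the end fold the four 32-bit partial sums into one value, so the whole routine is a rounded fixed-point dot product. A scalar sketch (my own illustration; PRECISION_S16 is the resampler's fixed-point shift and its value is not shown in this excerpt, so it is passed as a parameter here):

#include <stdint.h>

static void inner_product_s16_scalar(int16_t *o, const int16_t *a,
    const int16_t *b, int len, int precision)
{
  int32_t sum = 0;
  for (int i = 0; i < len; i++)
    sum += (int32_t) a[i] * b[i];                      /* accumulate 16x16 -> 32-bit products */
  sum = (sum + (1 << (precision - 1))) >> precision;   /* round to nearest, then drop the fraction bits */
  if (sum > INT16_MAX) sum = INT16_MAX;                /* saturate like _mm_packs_epi32 */
  if (sum < INT16_MIN) sum = INT16_MIN;
  *o = (int16_t) sum;
}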
Example No. 19
__inline__ static void mul4(__m128i in, __m128i *out) {
	const __m128i shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	const __m128i mask = _mm_set_epi32(135, 1, 1, 1);
	block intmp = _mm_shuffle_epi8(in, shuf);
	block tmp = _mm_srai_epi32(intmp, 31);
	tmp = _mm_and_si128(tmp, mask);
	tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
	*out = _mm_slli_epi32(intmp, 1);
	*out = _mm_xor_si128(*out, tmp);
	tmp = _mm_srai_epi32(*out, 31);
	tmp = _mm_and_si128(tmp, mask);
	tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
	*out = _mm_slli_epi32(*out, 1);
	*out = _mm_xor_si128(*out, tmp);
	*out = _mm_shuffle_epi8(*out, shuf);
}
Example No. 20
__m128i test_mm_srai_epi32(__m128i A) {
  // DAG-LABEL: test_mm_srai_epi32
  // DAG: call <4 x i32> @llvm.x86.sse2.psrai.d
  //
  // ASM-LABEL: test_mm_srai_epi32
  // ASM: psrad
  return _mm_srai_epi32(A, 1);
}
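For readers unfamiliar with the intrinsic itself, a minimal self-contained sketch of its per-lane behaviour (my own illustration, not part of the test file above):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_set_epi32(-16, -1, 8, 1);   /* arguments are listed highest lane first */
  __m128i r = _mm_srai_epi32(v, 2);           /* arithmetic shift: the sign bit is replicated */
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);   /* prints: 0 2 -1 -4 */
  return 0;
}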
Example No. 21
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/16)*16;

	__m128i zero = _mm_setzero_si128();
	__m128 constant1 = _mm_set1_ps(1.0f/128.0f);
	__m128i constant2 = _mm_set1_epi16(128);
	for (int i = 0; i < sse_size; i+=16)
	{
		__m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
		__m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2);
		__m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2);
		__m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16));
		__m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16));
		__m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16));
		__m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16));
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		samples2 = _mm_mul_ps(samples2, constant1);
		samples3 = _mm_mul_ps(samples3, constant1);

		__m128 tmp0, tmp1, tmp2, tmp3;
		tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
		tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
		tmp2 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(2,0,2,0));
		tmp3 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(3,1,3,1));

		 _mm_storeu_ps(output[0]+i/2, tmp0);
		 _mm_storeu_ps(output[1]+i/2, tmp1);
		 _mm_storeu_ps(output[0]+i/2+4, tmp2);
		 _mm_storeu_ps(output[1]+i/2+4, tmp3);
	}
#else
	const int sse_size = 0;
#endif
	// unpack remaining
	for (int i = sse_size; i < size; i+=2)
	{
		int value = input[i];
		output[0][i/2] = ((float) (value - 128)) / 128.0f;

		value = input[i+1];
		output[1][i/2] = ((float) (value - 128)) / 128.0f;
	}
}
Example No. 22
static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
  return rgba2;
}
Example No. 23
    SIMDValue SIMDInt32x4Operation::OpShiftRightArithmetic(const SIMDValue& value, int count)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpValue = X86SIMDValue::ToX86SIMDValue(value);
        // Shifts the 4 signed 32-bit integers right by count bits while shifting in the sign bit
        x86Result.m128i_value = _mm_srai_epi32(tmpValue.m128i_value, count);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example No. 24
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
	const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
	const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

	const __m128 _ps_pi    = _mm_set1_ps(pi);
	const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128 mm1, mm2, mm3;
	__m128 axm, aym;

	__m128 xm = _mm_set1_ps(x);
	__m128 ym = _mm_set1_ps(y);

	axm = _mm_and_ps(xm, _mask_sign_inv);
	aym = _mm_and_ps(ym, _mask_sign_inv);

	mm1 = _mm_min_ps(axm, aym);
	mm2 = _mm_max_ps(axm, aym);
	mm1 = _mm_div_ps(mm1, mm2);
	mm2 = _mm_mul_ps(mm1, mm1);

	mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
	mm3 = _mm_add_ps(mm3, _ps_atan_p1);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_mul_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	__m128 mask;

	/* |y| > |x| */
	mask = _mm_cmpgt_ss(aym, axm);
	mm2 = _mm_and_ps(_ps_pi0p5, mask);
	mm1 = _mm_and_ps(_mask_sign_raw, mask);
	mm3 = _mm_xor_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm2);

	/* x < 0 */
	mask = _mm_and_ps(xm, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mask);
	mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
	mm1 = _mm_and_ps(_ps_pi, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	/* y < 0 */
	mm1 = _mm_and_ps(ym, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mm1);

	return _mm_cvtss_f32(mm3);
}
Example No. 25
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}
Example No. 26
__m64 _m_psradi(__m64 _M, int _Count)
{
    __m128i lhs = {0};
    lhs.m128i_i64[0] = _M.m64_i64;

    lhs = _mm_srai_epi32(lhs, _Count);

    _M.m64_i64 = lhs.m128i_i64[0];
    return _M;
}
Example No. 27
File: dsp.cpp Project: taqu/opus
    void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 2; // process 8 shorts at a time
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        __declspec(align(16)) LSshort tmp[8];

        const LSshort* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            // convert to 32-bit integers r0 and r1
            __m128i t0 = _mm_loadu_si128((const __m128i*)p);
            __m128i t1 = _mm_cmpgt_epi16(izero, t0);
            __m128i r0 = _mm_unpackhi_epi16(t0, t1);
            __m128i r1 = _mm_unpacklo_epi16(t0, t1);

            __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1)));
            __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1)));

            r2 = _mm_srai_epi32(r2, 1);
            r3 = _mm_srai_epi32(r3, 1);
            __m128i r4 = _mm_packs_epi32(r3, r2);
            _mm_store_si128((__m128i*)tmp, r4);
            q[0] = tmp[0];
            q[1] = tmp[2];
            q[2] = tmp[4];
            q[3] = tmp[6];
            p += 8;
            q += 4;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            s32 t = (p[j+0] + p[j+1]) >> 1;
            q[i] = static_cast<LSshort>(t);
        }
    }
Example No. 28
inline __m128i LOAD_QUANTISED(const int32_t *idata, const QuantisationMatrix *qmatrix, const int l, const int s) {
  __m128i D  = _mm_load_si128((__m128i *)idata);
  __m128i QF = _mm_load_si128((__m128i *)&qmatrix->qfactor[l][s]);
  __m128i QO = _mm_load_si128((__m128i *)&qmatrix->qoffset[l][s]);
  __m128i X  = _mm_abs_epi32(D);
  X = _mm_mullo_epi32(X, QF);
  X = _mm_add_epi32(X, QO);
  X = _mm_srai_epi32(X, 2);
  X = _mm_sign_epi32(X, D);
  return X;
}
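In scalar terms each lane above is dequantised as follows (my own illustration of what the intrinsics compute; note that _mm_sign_epi32 also zeroes lanes where the input coefficient is zero):

#include <stdint.h>
#include <stdlib.h>

static int32_t load_quantised_scalar(int32_t d, int32_t qfactor, int32_t qoffset)
{
  int32_t x = (abs(d) * qfactor + qoffset) >> 2;   /* scale the magnitude and add the offset */
  return (d > 0) ? x : (d < 0) ? -x : 0;           /* re-apply the sign, zeroing where d == 0 */
}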
Example No. 29
//
// This was v_mul_complex16_shift, but I renamed it for consistency with v_conj_mul
// and because the old v_mul_complex16 was never called
//
FORCE_INLINE
int __ext_v_mul_complex16(struct complex16* out, int lenout,
                struct complex16* x, int len1,
                struct complex16* y, int len2, int shift)
{
  const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Outs = (__m128i*) out;
  for (int i = 0; i < len1 / wlen; i++){
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms1 = _mm_xor_si128(mx, xmm5);
    ms1 = _mm_add_epi32(ms1, xmm4);

    __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift);
    __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift);

    mre = _mm_and_si128(mre,xmm6);
    mim = _mm_and_si128(mim,xmm6);

    mim = _mm_slli_epi32(mim,0x10);

    _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift;
    out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift;
  }

  return 0;
}
Example No. 30
void f0r_update(f0r_instance_t instance, double time, const uint32_t *inframe, uint32_t *outframe)
{
	assert(instance);
	colgate_instance_t *inst = (colgate_instance_t *)instance;
	unsigned len = inst->width * inst->height;
	unsigned char *dst = (unsigned char *)outframe;
	const unsigned char *src = (unsigned char *)inframe;
	unsigned i;

#ifdef __SSE2__
	__m128i zero = _mm_setzero_si128();
	__m128i max = _mm_set1_epi16(REVERSE_LUT_SIZE - 1);
	for (i = 0; i < len; ++i) {
		__m128i l1 = inst->premult_r[*src++];
		__m128i l2 = inst->premult_g[*src++];
		__m128i l3 = inst->premult_b[*src++];
		__m128i result = _mm_add_epi32(l3, _mm_add_epi32(l1, l2));

		// Shift into the right range, and then clamp to [min, max].
		// We convert to 16-bit values since we have min/max instructions
		// there (without needing SSE4), and because it allows us
		// to extract the values with one less SSE shift/move.
		result = _mm_srai_epi32(result, INPUT_PIXEL_BITS + MATRIX_ELEMENT_FRAC_BITS - REVERSE_LUT_BITS);
		result = _mm_packs_epi32(result, result);
		result = _mm_max_epi16(result, zero);
		result = _mm_min_epi16(result, max);

		unsigned new_rg = _mm_cvtsi128_si32(result);
		result = _mm_srli_si128(result, 4);
		unsigned new_b = _mm_cvtsi128_si32(result);

		*dst++ = linear_rgb_to_srgb_lut[new_rg & 0xffff];
		*dst++ = linear_rgb_to_srgb_lut[new_rg >> 16];
		*dst++ = linear_rgb_to_srgb_lut[new_b];
		*dst++ = *src++;  // Copy alpha.
	}
#else
	for (i = 0; i < len; ++i) {
		unsigned old_r = *src++;
		unsigned old_g = *src++;
		unsigned old_b = *src++;

		int new_r = inst->premult_r[old_r][0] + inst->premult_g[old_g][0] + inst->premult_b[old_b][0];
		int new_g = inst->premult_r[old_r][1] + inst->premult_g[old_g][1] + inst->premult_b[old_b][1];
		int new_b = inst->premult_r[old_r][2] + inst->premult_g[old_g][2] + inst->premult_b[old_b][2];

		*dst++ = convert_linear_rgb_to_srgb_fp(new_r);
		*dst++ = convert_linear_rgb_to_srgb_fp(new_g);
		*dst++ = convert_linear_rgb_to_srgb_fp(new_b);
		*dst++ = *src++;  // Copy alpha.
	}
#endif
}