Example #1
int add_real_vector64_scalar(short *x, 
			     long long int a, 
			     short *y, 
			     unsigned int N)
{
  unsigned int i;                 // loop counter

  __m128i *x_128;
  __m128i *y_128;
  __m128i alpha_128;

  x_128 = (__m128i *)&x[0];
  y_128 = (__m128i *)&y[0];

  alpha_128 = _mm_set1_epi64((__m64) a);

  // add the 64-bit scalar to 4 vectors (8 x 64-bit lanes) per loop iteration
  for(i=0;i<(N>>3);i++)
  {
    y_128[0] = _mm_add_epi64(alpha_128, x_128[0]);
    y_128[1] = _mm_add_epi64(alpha_128, x_128[1]);
    y_128[2] = _mm_add_epi64(alpha_128, x_128[2]);
    y_128[3] = _mm_add_epi64(alpha_128, x_128[3]);


    x_128+=4;
    y_128+=4;

  }
  return(0);
}
Example #2
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i C0 = _mm_add_epi64(B0, rounder);
  const __m128i C1 = _mm_add_epi64(B1, rounder);
  const __m128i C2 = _mm_add_epi64(B2, rounder);
  const __m128i C3 = _mm_add_epi64(B3, rounder);
  const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
  const __m128i D2 = _mm_and_si128(C2, mask);
  const __m128i D3 = _mm_and_si128(C3, mask);
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
int64_t vp9_block_error_avx2(const int16_t *coeff,
                             const int16_t *dqcoeff,
                             intptr_t block_size,
                             int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0 ; i < block_size ; i+= 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }
  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));

  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i*)(&sse), sse_reg128);

  _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
  return sse;
}
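The tail of vp9_block_error_avx2 above folds the 64-bit accumulators by adding the upper half of each register to the lower half. A minimal, self-contained sketch of that reduction for a single __m128i (illustrative only, not libvpx code):

#include <emmintrin.h>
#include <stdint.h>

/* Horizontally add the two 64-bit lanes of v -- the same reduction idiom
 * used at the end of vp9_block_error_avx2 once the 256-bit sums have been
 * narrowed to 128 bits. */
static int64_t hsum_epi64(__m128i v)
{
  const __m128i hi  = _mm_srli_si128(v, 8);   /* move the upper lane down  */
  const __m128i sum = _mm_add_epi64(v, hi);   /* lane 0 = lane 0 + lane 1  */
  int64_t out;
  _mm_storel_epi64((__m128i *)&out, sum);     /* store the low 64 bits     */
  return out;
}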
Example #4
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uPlane       = output[1];
    LPBYTE vPlane       = output[2];
    int  chrPitch       = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos    = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos  = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0+width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];

                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
Example #5
/**
 * Processes two doubles at a time
 */
int
_mandelbrot_2( double const * const c_re_arg, 
	           double const * const c_im_arg, 
	           int                  max_iter 
	         )
{
	__m128d z_re = _mm_load_pd(c_re_arg);
	__m128d z_im = _mm_load_pd(c_im_arg);
	__m128d y_re;
	__m128d y_im;
	__m128d c_re = z_re;
	__m128d c_im = z_im;

	__m128i count = _mm_set1_epi64x(0);

	__m128d md;
	__m128d mt;
	__m128i mi = _mm_set1_epi16(0xffff);

	__m128d two = _mm_set1_pd(2.0);
	__m128i one = _mm_set1_epi64x(1);

	for (int i = 0; i<max_iter; i+=1)
	{
		// y = z .* z;
		y_re = _mm_mul_pd(z_re, z_re);
		y_im = _mm_mul_pd(z_im, z_im);

		// y = z * z;
		y_re = _mm_sub_pd(y_re, y_im);
		y_im = _mm_mul_pd(z_re, z_im);
		y_im = _mm_add_pd(y_im, y_im);

		// z = z * z + c
		z_re = _mm_add_pd(y_re, c_re);
		z_im = _mm_add_pd(y_im, c_im);

		// if condition
		// md = _mm_add_pd(z_re, z_im);
		// md = _mm_cmplt_pd(md, four);
		md = _mm_cmplt_pd(z_re, two);
		mt = _mm_cmplt_pd(z_im, two);
		md = _mm_and_pd(md, mt);
		mi = _mm_and_si128(mi, (__m128i) md);
		// PRINT_M128I(mi);
		if ( !_mm_movemask_pd(md) ) { break; }

		// count iterations
		count = _mm_add_epi64( count, _mm_and_si128( mi, one) );
	}

	int val;
	count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
	val   = _mm_cvtsi128_si64( count );

	return val;
}
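The per-lane iteration count in _mandelbrot_2 is kept with a masked add: the comparison mask (all-ones for lanes still inside the bound) is AND-ed with 1 before the 64-bit add, so finished lanes stop counting. A small hedged sketch of that idiom on its own:

#include <emmintrin.h>

/* 'alive' is an all-ones / all-zeros mask per 64-bit lane; AND-ing it with 1
 * and adding increments only the lanes that are still iterating. */
static __m128i count_active(__m128i count, __m128i alive)
{
	const __m128i one = _mm_set1_epi64x(1);
	return _mm_add_epi64(count, _mm_and_si128(alive, one));
}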
Example #6
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc0_q = _mm_setzero_si128();
  __m128i v_acc1_q = _mm_setzero_si128();

  const int16_t *const end = src + n;

  assert(n % 64 == 0);

  while (src < end) {
    const __m128i v_val_0_w = xx_load_128(src);
    const __m128i v_val_1_w = xx_load_128(src + 8);
    const __m128i v_val_2_w = xx_load_128(src + 16);
    const __m128i v_val_3_w = xx_load_128(src + 24);
    const __m128i v_val_4_w = xx_load_128(src + 32);
    const __m128i v_val_5_w = xx_load_128(src + 40);
    const __m128i v_val_6_w = xx_load_128(src + 48);
    const __m128i v_val_7_w = xx_load_128(src + 56);

    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);

    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));

    src += 64;
  }

  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
    return tmp;
  }
#endif
}
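The v_zext_mask_q trick in aom_sum_squares_i16_64n_sse2 splits each group of four 32-bit partial sums into even and odd lanes zero-extended to 64 bits, so they can be accumulated with _mm_add_epi64 without 32-bit overflow. A hedged standalone sketch of that widening step (names are illustrative):

#include <emmintrin.h>

/* Accumulate four unsigned 32-bit partial sums into two 64-bit accumulators:
 * even lanes are zero-extended with a mask, odd lanes with a 32-bit shift. */
static void accumulate_u32x4_as_u64(__m128i sum32,
                                    __m128i *acc_even, __m128i *acc_odd)
{
  const __m128i zext_mask = _mm_set_epi32(0, -1, 0, -1); /* keep lanes 0 and 2 */
  const __m128i even = _mm_and_si128(sum32, zext_mask);  /* lanes 0,2 as u64   */
  const __m128i odd  = _mm_srli_epi64(sum32, 32);        /* lanes 1,3 as u64   */
  *acc_even = _mm_add_epi64(*acc_even, even);
  *acc_odd  = _mm_add_epi64(*acc_odd, odd);
}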
/**
*******************************************************************************
*
* @brief
*  Compute 8x4 SAD
*
* @par   Description
*  Compute 8x4 sum of absolute differences between source and reference block
*
* @param[in] pu1_src
*  Source buffer
*
* @param[in] pu1_ref
*  Reference buffer
*
* @param[in] src_strd
*  Source stride
*
* @param[in] ref_strd
*  Reference stride
*
* @param[in] wd
*  Assumed to be 8
*
* @param[in] ht
*  Assumed to be 4
*
* @returns
*  SAD
*
* @remarks
*
*******************************************************************************
*/
WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src,
                         UWORD8 *pu1_ref,
                         WORD32 src_strd,
                         WORD32 ref_strd,
                         WORD32 wd,
                         WORD32 ht)
{
    WORD32 sad;
    __m128 src_r0, src_r1;
    __m128 ref_r0, ref_r1;
    __m128i res_r0, res_r1;

    UNUSED(wd);
    UNUSED(ht);
    ASSERT(wd == 8);
    ASSERT(ht == 4);

    /* Load source */
    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
    pu1_src += src_strd;


    /* Load reference */
    ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    /* Compute SAD for each row */
    res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0);
    res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1);

    /* Accumulate SAD */
    res_r0 = _mm_add_epi64(res_r0,  res_r1);
    res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8));

    sad  = _mm_cvtsi128_si32(res_r0);

    return sad;
}
Example #8
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Example #9
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
  int x_out;
  uint8_t* const dst = wrk->dst;
  rescaler_t* const irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale);

  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0);
  assert(wrk->y_expand);
  if (wrk->y_accum == 0) {
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3;
      LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
    }
    for (; x_out < x_out_max; ++x_out) {
      const uint32_t J = frow[x_out];
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
    }
  } else {
    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
    const __m128i mA = _mm_set_epi32(0, A, 0, A);
    const __m128i mB = _mm_set_epi32(0, B, 0, B);
    const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3, B0, B1, B2, B3;
      LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
      LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
      {
        const __m128i C0 = _mm_add_epi64(A0, B0);
        const __m128i C1 = _mm_add_epi64(A1, B1);
        const __m128i C2 = _mm_add_epi64(A2, B2);
        const __m128i C3 = _mm_add_epi64(A3, B3);
        const __m128i D0 = _mm_add_epi64(C0, rounder);
        const __m128i D1 = _mm_add_epi64(C1, rounder);
        const __m128i D2 = _mm_add_epi64(C2, rounder);
        const __m128i D3 = _mm_add_epi64(C3, rounder);
        const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX);
        const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
        const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
        const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
        ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
      }
    }
    for (; x_out < x_out_max; ++x_out) {
      const uint64_t I = (uint64_t)A * frow[x_out]
                       + (uint64_t)B * irow[x_out];
      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
      const int v = (int)MULT_FIX(J, wrk->fy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
    }
  }
}
Example #10
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16            *inVec1,            /*    I input vector 1                                              */
    const opus_int16            *inVec2,            /*    I input vector 2                                              */
    const opus_int              len                 /*    I vector lengths                                              */
)
{
    opus_int  i, dataSize8;
    opus_int64 sum;

    __m128i xmm_tempa;
    __m128i inVec1_76543210, acc1;
    __m128i inVec2_76543210, acc2;

    sum = 0;
    dataSize8 = len & ~7;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for( i = 0; i < dataSize8; i += 8 ) {
        inVec1_76543210 = _mm_loadu_si128( (__m128i*)(&inVec1[i + 0] ) );
        inVec2_76543210 = _mm_loadu_si128( (__m128i*)(&inVec2[i + 0] ) );

        /* the madd wraps around only when all 4 operands are -32768 (0x8000) */
        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
        /* equal shift right 8 bytes */
        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

        acc1 = _mm_add_epi64( acc1, xmm_tempa );
        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
    }

    acc1 = _mm_add_epi64( acc1, acc2 );

    /* equal shift right 8 bytes */
    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    acc1 = _mm_add_epi64( acc1, acc2 );

    _mm_storel_epi64( (__m128i *)&sum, acc1 );

    for( ; i < len; i++ ) {
        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
    }

    return sum;
}
Example #11
File: ars.hpp Project: zhouyan/vSMC
 void generate(std::array<M128I<U>, Rp1> &rk,
               const M128I<std::uint64_t> &weyl, std::true_type) const
 {
     std::get<N>(rk) =
         _mm_add_epi64(std::get<N - 1>(rk).value(), weyl.value());
     generate<N + 1>(rk, weyl, std::integral_constant<bool, N + 1 < Rp1>());
 }
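The template above unrolls the ARS key schedule at compile time: each round key is the previous one plus a fixed 64-bit Weyl increment. A plain-intrinsics sketch of the same idea (placeholder names, not vSMC's API):

#include <emmintrin.h>

/* rk[0] = seed, rk[i] = rk[i-1] + weyl (lane-wise 64-bit add), for the
 * requested number of round keys. */
static void ars_expand_keys(__m128i seed, __m128i weyl, __m128i rk[], int rounds)
{
    rk[0] = seed;
    for (int i = 1; i < rounds; ++i)
        rk[i] = _mm_add_epi64(rk[i - 1], weyl);
}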
Example #12
void*drawman(void*x){
	int c=col++;
	unsigned _m=mx,mxx=16777216/_m;
	double _x=xx,_y=yy,_w=wh;
	do{
		__m128d cr=_mm_set1_pd(_x+_w*c);
		for(int j=0;j<512;j+=2){
			__m128d zr=cr,
				zi=_mm_set_pd(_y+_w*j,_y+_w*(j+1)),ci=zi,
				zr2=_mm_mul_pd(zr,zr),zi2=_mm_mul_pd(zi,zi);
			unsigned mk=mx-1;
			uint64_t kk[2]__attribute__((aligned(16)))={mk,mk};
			__m128i k=_mm_load_si128((__m128i*)kk);
			do{
				zi=_mm_mul_pd(zi,zr);
				zi=_mm_add_pd(_mm_add_pd(zi,zi),ci);
				zr=_mm_add_pd(_mm_sub_pd(zr2,zi2),cr);
				zr2=_mm_mul_pd(zr,zr);
				zi2=_mm_mul_pd(zi,zi);
				__m128d n=_mm_cmplt_pd(_mm_add_pd(zr2,zi2),_mm_set1_pd(4));
				if(!_mm_movemask_pd(n))break;
				k=_mm_add_epi64(k,_mm_castpd_si128(n));
			}while(--mk);
			_mm_store_si128((__m128i*)kk,k);
			manor[c][j]=kk[1]*mxx>>16;
			manor[c][j+1]=kk[0]*mxx>>16;
		}
		done[c>>6]|=1ULL<<(c&63);
		c=col++;
	}while(c<512&&!pull);
}
Example #13
		template <bool align> void SquaredDifferenceSum(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth);
			__m128i fullSum = _mm_setzero_si128();
			for(size_t row = 0; row < height; ++row)
			{
				__m128i rowSum = _mm_setzero_si128();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m128i a_ = Load<align>((__m128i*)(a + col));
					const __m128i b_ = Load<align>((__m128i*)(b + col)); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A)));
					const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
			}
			*sum = ExtractInt64Sum(fullSum);
		}
Example #14
__m128i test_mm_add_epi64(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_add_epi64
  // DAG: add <2 x i64>
  //
  // ASM-LABEL: test_mm_add_epi64
  // ASM: paddq
  return _mm_add_epi64(A, B);
}
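For reference, a small standalone snippet (not part of the clang test above) showing the per-lane, modulo-2^64 behaviour the test checks for:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const __m128i a = _mm_set_epi64x(1, (long long)UINT64_MAX); /* lanes: {2^64-1, 1} */
  const __m128i b = _mm_set_epi64x(2, 1);                     /* lanes: {1, 2}      */
  const __m128i c = _mm_add_epi64(a, b);      /* low lane wraps to 0, high lane = 3 */

  uint64_t out[2];
  _mm_storeu_si128((__m128i *)out, c);
  printf("%llu %llu\n", (unsigned long long)out[0], (unsigned long long)out[1]);
  /* prints: 0 3 */
  return 0;
}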
Example #15
void Convert444toNV12(LPBYTE input, int width, int inPitch, int outPitch, int height, int startY, int endY, LPBYTE *output)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uvPlane		= output[1];

    __m128i lumMask = _mm_set1_epi32(0x0000FF00);
    __m128i uvMask = _mm_set1_epi16(0x00FF);

    for(int y=startY; y<endY; y+=2)
    {
        int yPos    = y*inPitch;
        int uvYPos = (y>>1)*outPitch;
        int lumYPos = y*outPitch;

        for(int x=0; x<width; x+=4)
        {
            LPBYTE lpImagePos = input+yPos+(x*4);
            int uvPos  = uvYPos + x;
            int lumPos0 = lumYPos + x;
            int lumPos1 = lumPos0 + outPitch;

            __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
            __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+inPitch));

            //pack lum vals
            {
                __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                packVal = _mm_packus_epi16(packVal, packVal);

                *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
            }

            //do average, pack UV vals
            {
                __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));

                *(LPUINT)(uvPlane+uvPos) = _mm_packus_epi16(avgVal, avgVal).m128i_u32[0];
            }
        }
    }
}
Example #16
/*
 * mixed endian increment, low 64bits stored in hi word to be compatible
 * with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}
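nextc above keeps the low 64 bits of the counter in the high lane to match the byte-swapped layout mentioned in its comment. For comparison, a hedged sketch of the same carry-propagation trick for a counter stored in plain little-endian lane order (low 64 bits in the low lane):

#include <emmintrin.h>
#include <smmintrin.h>   /* _mm_cmpeq_epi64 is SSE4.1 */

static inline __m128i
nextc_le(__m128i x)
{
	const __m128i ONE  = _mm_set_epi64x(0, 1);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);              /* increment the low lane              */
	__m128i t = _mm_cmpeq_epi64(x, ZERO);   /* low lane is all-ones iff it wrapped */
	t = _mm_slli_si128(t, 8);               /* move that mask into the high lane   */
	x = _mm_sub_epi64(x, t);                /* subtracting -1 carries into it      */

	return x;
}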
Example #17
unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) {
    __m128i sum = zeroes;

    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x += 16) {
            __m128i src;
            if (width == 4)
                src = _mm_cvtsi32_si128(*(const int *)pSrc);
            else if (width == 8)
                src = _mm_loadl_epi64((const __m128i *)pSrc);
            else
                src = _mm_loadu_si128((const __m128i *)&pSrc[x]);

            sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes));
        }

        pSrc += nSrcPitch;
    }

    if (width >= 16)
        sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8));

    return (unsigned)_mm_cvtsi128_si32(sum);
}
Example #18
			inline void MoveNext(int s) {
				if(s == 0) return;
				else if(s < 4){
					auto v = uv, st = uvStep;
					while(s--) {
						v = _mm_add_epi64(v, st);
					}
					uv = v;
				}
				else {
					// no SSE2 support for 64bit multiply, but
					// this isn't a big problem because this case is rare
					uvU += stepU * s;
					uvV += stepV * s;
				}
			}
Example #19
SSE_FUNCTION static void
sad8x8_u8_sse (uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2,
    int sstr2)
{
  int i;
  __m128i sum = _mm_setzero_si128();
  union m128_int sumi;

  for (i = 0; i < 4; i++) {
    __m128i xmm0, xmm1, xmm2, xmm3;
    xmm0 = _mm_loadl_epi64((__m128i *)src1);
    xmm1 = _mm_loadl_epi64((__m128i *)(src1 + sstr1));
    xmm2 = _mm_loadl_epi64((__m128i *)src2);
    xmm3 = _mm_loadl_epi64((__m128i *)(src2 + sstr2));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm3);
    sum = _mm_add_epi64(sum, _mm_sad_epu8(xmm0, xmm2));
    src1 += 2 * sstr1;
    src2 += 2 * sstr2;
  }
  sumi.m128 = sum;
  *dest = sumi.i[0] + sumi.i[2];
}
Example #20
int64_t get_sum_vectorised (int64_t * vector)
{

	__m128i sum = _mm_setzero_si128();
	int64_t actualSum = 0;


	// each __m128i holds two 64-bit lanes, so advance two elements per iteration
	for (int64_t i = 0; i < g_length/2*2; i += 2)
	{
		__m128i temp = _mm_loadu_si128((__m128i *)(vector + i));
		sum = _mm_add_epi64(sum, temp);
	}

	int64_t A[2] = {0,0};
	_mm_storeu_si128((__m128i *)A, sum);
	actualSum += A[0] + A[1];

	for (int64_t i = g_length/2*2; i < g_length; i++)
	{
		actualSum += vector[i];
	}

	return actualSum;   
}
Example #21
static inline void
ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
			RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mb_pool,
				 (void *)rxep,
				 RTE_IXGBE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128((__m128i *)&rxdp[i].read,
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			RTE_IXGBE_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;
		uintptr_t p0, p1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

		/*
		 * Flush mbuf with pkt template.
		 * Data to be rearmed is 6 bytes long.
		 * Though, RX will overwrite ol_flags that are coming next
		 * anyway. So overwrite whole 8 bytes with one load:
		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
		 */
		p0 = (uintptr_t)&mb0->rearm_data;
		*(uint64_t *)p0 = rxq->mbuf_initializer;
		p1 = (uintptr_t)&mb1->rearm_data;
		*(uint64_t *)p1 = rxq->mbuf_initializer;

		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
		vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr));
		vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr));

		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* set Header Buffer Address to zero */
		dma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);
		dma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);

		/* flush desc with pa dma_addr */
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
	}

	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;

	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
}
Example #22
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
{
	__m128i remainingbits;

	// Add remaining bytes in the buffer
	state->processed_bits += state->uBufferBytes * 8;

	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);

	// Pad with 0x80
	state->buffer[state->uBufferBytes++] = 0x80;
	
	// Enough buffer space for padding in this block?
	if((state->uBlockLength - state->uBufferBytes) >= 18)
	{
		// Pad with zeros
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));

		// Hash size
		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

		// Processed bits
		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

		// Last block contains message bits?
		if(state->uBufferBytes == 1)
		{
			state->k = _mm_xor_si128(state->k, state->k);
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}
		else
		{
			state->k = _mm_add_epi64(state->k, remainingbits);
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}

		// Compress
		Compress(state, state->buffer, 1);
	}
	else
	{
		// Fill with zero and compress
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
		state->k = _mm_add_epi64(state->k, remainingbits);
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);

		// Last block
		memset(state->buffer, 0, state->uBlockLength - 18);

		// Hash size
		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

		// Processed bits
		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

		// Compress the last block
		state->k = _mm_xor_si128(state->k, state->k);
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);
	}

	// Store the hash value
	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);

	if(state->uHashSize == 512)
	{
		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
	}

	return SUCCESS;
}
Example #23
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
	unsigned int r, b, i, j;
	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 


	for(i = 0; i < 4; i++)
		for(j = 0; j < ctx->uHashSize / 256; j++)
			_state[i][j] = ctx->state[i][j];


#if HAVE_AES_NI
	// transform cv
	for(i = 0; i < 4; i++)
		for(j = 0; j < ctx->uHashSize / 256; j++)
		{
			TRANSFORM(_state[i][j], _k_ipt, t1, t2);
		}
#endif

	for(b = 0; b < uBlockCount; b++)
	{
		ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);

		// load message
		for(j = ctx->uHashSize / 256; j < 4; j++)
		{
			for(i = 0; i < 4; i++)
			{
				_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);

#if HAVE_AES_NI
				// transform message
				TRANSFORM(_state[i][j], _k_ipt, t1, t2);
#endif
			}
		}

		// save state
		SAVESTATE(_statebackup, _state);


		k1 = ctx->k;

#if HAVE_AES_NI
		for(r = 0; r < ctx->uRounds / 2; r++)
		{
			ECHO_ROUND_UNROLL2;
		}

#else
		for(r = 0; r < ctx->uRounds / 2; r++)
		{
			_state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero);
			_state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero);
			_state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero);
			_state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero);																			

			ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2);

			_state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero);
			_state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero);
			_state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero);
			_state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero);																			

			ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2);

		}
#endif

		
		if(ctx->uHashSize == 256)
		{
			for(i = 0; i < 4; i++)
			{
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);

				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
			}
		}
		else
		{
			for(i = 0; i < 4; i++)
			{
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
				_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);

				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);

				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
			}
		}

		pmsg += ctx->uBlockLength;
	}

#if HAVE_AES_NI
	// transform state
	for(i = 0; i < 4; i++)
		for(j = 0; j < 4; j++)
		{
			TRANSFORM(_state[i][j], _k_opt, t1, t2);
		}
#endif

		SAVESTATE(ctx->state, _state);

}
Example #24
uint32_t FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
{
	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
	uint32_t i, order;

	__m128i total_err0, total_err1, total_err3;

	{
		FLAC__int32 itmp;
		__m128i last_error, zero = _mm_setzero_si128();

		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
		itmp = data[-2];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
		itmp -= data[-3];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
		itmp -= data[-3] - data[-4];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3

		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
		for(i = 0; i < data_len; i++) {
			__m128i err0, err1, tmp;
			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
#if 1 /* OPT_SSE */
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#else
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#endif
			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3

			tmp = _mm_srai_epi32(err0, 31);
			err0 = _mm_xor_si128(err0, tmp);
			err0 = _mm_sub_epi32(err0, tmp);
			tmp = _mm_srai_epi32(err1, 31);
			err1 = _mm_xor_si128(err1, tmp);
			err1 = _mm_sub_epi32(err1, tmp);

			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
		}
	}

	m128i_to_i64(total_error_0, total_err0);
	m128i_to_i64(total_error_4, total_err3);
	m128i_to_i64(total_error_2, total_err1);
	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
	m128i_to_i64(total_error_3, total_err3);
	m128i_to_i64(total_error_1, total_err1);

	/* prefer higher order */
	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
		order = 0;
	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
		order = 1;
	else if(total_error_2 < flac_min(total_error_3, total_error_4))
		order = 2;
	else if(total_error_3 < total_error_4)
		order = 3;
	else
		order = 4;

	/* Estimate the expected number of bits per residual signal sample. */
	/* 'total_error*' is linearly related to the variance of the residual */
	/* signal, so we use it directly to compute E(|x|) */
	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);

	residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);

	return order;
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
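On x86-64, vp9_diamond_search_sad_avx forms the candidate block addresses entirely in SIMD registers: the base pointer is broadcast into both 64-bit lanes and the masked offsets are added with _mm_add_epi64. A hedged standalone sketch of that pointer-arithmetic idiom (assumes 8-byte pointers; names are illustrative):

#include <emmintrin.h>
#include <stdint.h>

static void make_candidate_ptrs(const uint8_t *base, const int64_t off[2],
                                const uint8_t *out[2])
{
  const __m128i v_base = _mm_set1_epi64x((int64_t)(intptr_t)base);
  const __m128i v_off  = _mm_loadu_si128((const __m128i *)off);
  const __m128i v_ptrs = _mm_add_epi64(v_base, v_off);  /* base + offset per lane */
  _mm_storeu_si128((__m128i *)out, v_ptrs);             /* read lanes back as pointers */
}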
/**
*******************************************************************************
*
* @brief
*  Performs spatial edge adaptive filtering
*
* @par   Description
*  Performs spatial edge adaptive filtering by detecting edge direction
*
* @param[in] pu1_src
*  Source buffer
*
* @param[in] pu1_out
*  Destination buffer
*
* @param[in] src_strd
*  Source stride
*
* @param[in] out_strd
*  Destination stride
*
* @returns
* None
*
* @remarks
*
*******************************************************************************
*/
void ideint_spatial_filter_ssse3(UWORD8 *pu1_src,
                           UWORD8 *pu1_out,
                           WORD32 src_strd,
                           WORD32 out_strd)
{
    WORD32 i;

    WORD32 adiff[6];
    WORD32 *pi4_diff;
    WORD32 shifts[2];
    WORD32 dir_45_le_90, dir_45_le_135, dir_135_le_90;

    __m128i row1_0, row1_m1, row1_p1;
    __m128i row2_0, row2_m1, row2_p1;
    __m128i diff, diffs[3];
    __m128i zero;

    /*****************************************************************/
    /* Direction detection                                           */
    /*****************************************************************/

    zero = _mm_setzero_si128();
    diffs[0] = _mm_setzero_si128();
    diffs[1]  = _mm_setzero_si128();
    diffs[2] = _mm_setzero_si128();

    /* Load source */
    row1_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
    row1_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
    row1_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
    pu1_src += src_strd;

    /* Unpack to 16 bits */
    row1_m1 = _mm_unpacklo_epi8(row1_m1, zero);
    row1_0  = _mm_unpacklo_epi8(row1_0,  zero);
    row1_p1 = _mm_unpacklo_epi8(row1_p1, zero);

    /*****************************************************************/
    /* Calculating the difference along each of the 3 directions.    */
    /*****************************************************************/
    for(i = 0; i < SUB_BLK_HT; i ++)
    {
        row2_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
        row2_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
        row2_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
        pu1_src += src_strd;

        /* Unpack to 16 bits */
        row2_m1 = _mm_unpacklo_epi8(row2_m1, zero);
        row2_0  = _mm_unpacklo_epi8(row2_0,  zero);
        row2_p1 = _mm_unpacklo_epi8(row2_p1, zero);

        diff    = _mm_sad_epu8(row1_0, row2_0);
        diffs[0]  = _mm_add_epi64(diffs[0], diff);

        diff    = _mm_sad_epu8(row1_m1, row2_p1);
        diffs[1] = _mm_add_epi64(diffs[1], diff);

        diff    = _mm_sad_epu8(row1_p1, row2_m1);
        diffs[2]  = _mm_add_epi64(diffs[2], diff);

        row1_m1 = row2_m1;
        row1_0 = row2_0;
        row1_p1 = row2_p1;
    }
    /* Revert pu1_src increment */
    pu1_src -= (SUB_BLK_HT + 1) * src_strd;


    adiff[0] = _mm_cvtsi128_si32(diffs[0]);
    adiff[1] = _mm_cvtsi128_si32(diffs[1]);
    adiff[2] = _mm_cvtsi128_si32(diffs[2]);
    adiff[3] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[0], 8));
    adiff[4] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[1], 8));
    adiff[5] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[2], 8));
    pi4_diff = adiff;

    for(i = 0; i < 2; i++)
    {
        /*****************************************************************/
        /* Applying bias, to make the diff comparison more robust.       */
        /*****************************************************************/
        pi4_diff[0] *= EDGE_BIAS_0;
        pi4_diff[1] *= EDGE_BIAS_1;
        pi4_diff[2] *= EDGE_BIAS_1;

        /*****************************************************************/
        /* comparing the diffs */
        /*****************************************************************/
        dir_45_le_90  = (pi4_diff[2] <= pi4_diff[0]);
        dir_45_le_135 = (pi4_diff[2] <= pi4_diff[1]);
        dir_135_le_90 = (pi4_diff[1] <= pi4_diff[0]);

        /*****************************************************************/
        /* Direction selection. */
        /*****************************************************************/
        shifts[i] = 0;
        if(1 == dir_45_le_135)
        {
            if(1 == dir_45_le_90)
                shifts[i] = 1;
        }
        else
        {
            if(1 == dir_135_le_90)
                shifts[i] = -1;
        }
        pi4_diff += 3;
    }
    /*****************************************************************/
    /* Directional interpolation */
    /*****************************************************************/
    for(i = 0; i < SUB_BLK_HT / 2; i++)
    {
        __m128i dst;
        __m128i row1, row2;

        UWORD32 *pu4_row1th, *pu4_row1tl;
        UWORD32 *pu4_row2th, *pu4_row2tl;
        UWORD32 *pu4_row1bh, *pu4_row1bl;
        UWORD32 *pu4_row2bh, *pu4_row2bl;

        pu4_row1th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row1tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        pu1_src += src_strd;
        pu4_row2th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row2tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        pu4_row1bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row1bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        pu1_src += src_strd;
        pu4_row2bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row2bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        row1 = _mm_set_epi32(*pu4_row1tl, *pu4_row1th, *pu4_row2tl, *pu4_row2th);
        row2 = _mm_set_epi32(*pu4_row1bl, *pu4_row1bh, *pu4_row2bl, *pu4_row2bh);

        dst = _mm_avg_epu8(row1, row2);

        _mm_storel_epi64((__m128i *)pu1_out, _mm_srli_si128(dst, 8));
        pu1_out += out_strd;

        _mm_storel_epi64((__m128i *)pu1_out, dst);
        pu1_out += out_strd;
    }
}
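The direction decision above reduces to comparing three biased SADs per 4-pixel-wide sub-block. Below is a minimal scalar sketch of that decision logic under the same bias scheme; the helper name and parameters are illustrative, with the EDGE_BIAS_0 / EDGE_BIAS_1 weights passed in explicitly.

/* Scalar sketch: pick the interpolation shift from three accumulated SADs.
 * sad_vert ~ diffs[0] (90 degrees), sad_135 ~ diffs[1], sad_45 ~ diffs[2]. */
static int pick_interp_shift(int sad_vert, int sad_135, int sad_45,
                             int bias_vert, int bias_diag)
{
    sad_vert *= bias_vert;   /* EDGE_BIAS_0 above */
    sad_135  *= bias_diag;   /* EDGE_BIAS_1 above */
    sad_45   *= bias_diag;

    if (sad_45 <= sad_135)
        return (sad_45 <= sad_vert) ? 1 : 0;    /* 45-degree edge, else vertical */
    else
        return (sad_135 <= sad_vert) ? -1 : 0;  /* 135-degree edge, else vertical */
}

The return value corresponds to the shifts[] entry consumed by the interpolation loop above.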
Example No. 27
0
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3;
    mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y;
    __m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext;
    mlib_d64 dsum = 0.0;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s32);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        *z = dsum;
    } else {
        for (i = 0; i < n1; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }

        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
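Both vector loops above use the same branchless absolute-difference idiom: build an all-ones/all-zeros mask with a signed compare, subtract, then xor and subtract the mask. A scalar sketch of the identity follows; the helper name is illustrative, and the subtraction is widened to 64 bits to avoid signed overflow, which the wrapping vector subtraction is immune to.

#include <stdint.h>

/* Scalar counterpart of:
 *   mext = _mm_cmpgt_epi32(ybuf, xbuf);  xbuf = _mm_sub_epi32(xbuf, ybuf);
 *   xbuf = _mm_xor_si128(xbuf, mext);    xbuf = _mm_sub_epi32(xbuf, mext);
 * mask is 0 when the difference is non-negative and -1 (all ones) otherwise,
 * so (d ^ mask) - mask negates d exactly when it is negative. */
static int64_t abs_diff_s32(int32_t x, int32_t y)
{
    int64_t d    = (int64_t)x - (int64_t)y;
    int64_t mask = -(int64_t)(d < 0);
    return (d ^ mask) - mask;
}

Because each 32-bit result is zero-extended into 64-bit lanes before accumulation, each lane of zbuf sums at most n values below 2^32 and cannot overflow for any mlib_s32 n.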
Example No. 28
0
mlib_status
__mlib_VectorSumAbsDiff_S16_Sat(
    mlib_d64 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, xval, sum = 0;
    mlib_s16 *px = (mlib_s16 *)x, *py = (mlib_s16 *)y;
    __m128i zero, xbuf, ybuf, zbuf32, zbuf64, xlo, xhi, mext;
    zero = _mm_setzero_si128();
    zbuf64 = zero;

    nstep = 16 / sizeof (mlib_s16);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s16);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }
        *z = sum;
    } else {
        for (i = 0; i < n1; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }
        mlib_s32 nblock = n2 >> 12;
        mlib_s32 tail = n2 & 4095;
        mlib_s32 k;
        if (ax == ay) {
            for (k = 0; k < nblock; k++) {
                zbuf32 = zero;
                for (i = 0; i < 4096; i++) {
                    xbuf = _mm_load_si128((__m128i *)px);
                    ybuf = _mm_load_si128((__m128i *)py);
                    mext = _mm_cmpgt_epi16(ybuf, xbuf);
                    xbuf = _mm_sub_epi16(xbuf, ybuf);
                    xbuf = _mm_xor_si128(xbuf, mext);
                    xbuf = _mm_sub_epi16(xbuf, mext);
                    xlo = _mm_unpacklo_epi16(xbuf, zero);
                    xhi = _mm_unpackhi_epi16(xbuf, zero);
                    zbuf32 = _mm_add_epi32(zbuf32, xlo);
                    zbuf32 = _mm_add_epi32(zbuf32, xhi);
                    px += nstep;
                    py += nstep;
                }
                xlo = _mm_unpacklo_epi32(zbuf32, zero);
                xhi = _mm_unpackhi_epi32(zbuf32, zero);
                zbuf64 = _mm_add_epi64(zbuf64, xlo);
                zbuf64 = _mm_add_epi64(zbuf64, xhi);
            }
            zbuf32 = zero;
            for (i = 0; i < tail; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi16(ybuf, xbuf);
                xbuf = _mm_sub_epi16(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi16(xbuf, mext);
                xlo = _mm_unpacklo_epi16(xbuf, zero);
                xhi = _mm_unpackhi_epi16(xbuf, zero);
                zbuf32 = _mm_add_epi32(zbuf32, xlo);
                zbuf32 = _mm_add_epi32(zbuf32, xhi);
                px += nstep;
                py += nstep;
            }
            xlo = _mm_unpacklo_epi32(zbuf32, zero);
            xhi = _mm_unpackhi_epi32(zbuf32, zero);
            zbuf64 = _mm_add_epi64(zbuf64, xlo);
            zbuf64 = _mm_add_epi64(zbuf64, xhi);
        } else { /* not aligned */
            for (k = 0; k < nblock; k++) {
                zbuf32 = zero;
                for (i = 0; i < 4096; i++) {
                    xbuf = _mm_load_si128((__m128i *)px);
                    ybuf = _mm_loadu_si128((__m128i *)py);
                    mext = _mm_cmpgt_epi16(ybuf, xbuf);
                    xbuf = _mm_sub_epi16(xbuf, ybuf);
                    xbuf = _mm_xor_si128(xbuf, mext);
                    xbuf = _mm_sub_epi16(xbuf, mext);
                    xlo = _mm_unpacklo_epi16(xbuf, zero);
                    xhi = _mm_unpackhi_epi16(xbuf, zero);
                    zbuf32 = _mm_add_epi32(zbuf32, xlo);
                    zbuf32 = _mm_add_epi32(zbuf32, xhi);
                    px += nstep;
                    py += nstep;
                }
                xlo = _mm_unpacklo_epi32(zbuf32, zero);
                xhi = _mm_unpackhi_epi32(zbuf32, zero);
                zbuf64 = _mm_add_epi64(zbuf64, xlo);
                zbuf64 = _mm_add_epi64(zbuf64, xhi);
            }
            zbuf32 = zero;
            for (i = 0; i < tail; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi16(ybuf, xbuf);
                xbuf = _mm_sub_epi16(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi16(xbuf, mext);
                xlo = _mm_unpacklo_epi16(xbuf, zero);
                xhi = _mm_unpackhi_epi16(xbuf, zero);
                zbuf32 = _mm_add_epi32(zbuf32, xlo);
                zbuf32 = _mm_add_epi32(zbuf32, xhi);
                px += nstep;
                py += nstep;
            }
            xlo = _mm_unpacklo_epi32(zbuf32, zero);
            xhi = _mm_unpackhi_epi32(zbuf32, zero);
            zbuf64 = _mm_add_epi64(zbuf64, xlo);
            zbuf64 = _mm_add_epi64(zbuf64, xhi);
        }
        for (i = 0; i < n3; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }

        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf64);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
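The 4096-iteration blocking in this version exists to protect the 32-bit accumulator: each pass adds two absolute differences, each at most 65535 (< 2^16), to every 32-bit lane of zbuf32, so one block contributes at most 4096 * 2 * 65535 = 536,862,720 < 2^29 per lane before the block total is widened into the 64-bit lanes of zbuf64. The S32 variant above widens on every iteration instead, because a single 32-bit absolute difference can already need all 32 bits.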
Example No. 29
0
mlib_status
__mlib_VectorSumAbsDiff_S8_Sat(
    mlib_d64 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0;
    mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y;
    __m128i zero, xbuf, ybuf, zbuf, mext, mbuf;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s8);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s8);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        *z = sum;
    } else {
        for (i = 0; i < n1; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }

        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
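With 8-bit inputs the absolute differences already fit in unsigned bytes, so this variant hands the whole horizontal reduction to _mm_sad_epu8 against zero instead of the unpack-and-widen chains used by the S16 and S32 versions. A hypothetical usage sketch follows; the buffer contents, the length and the mlib.h header name are illustrative assumptions.

#include <stdio.h>
#include <mlib.h>   /* assumed mediaLib header providing the types and prototype */

int main(void)
{
    mlib_s8  x[5] = { 10, -20, 30, -40, 50 };
    mlib_s8  y[5] = { -5,  15, 25,  45, -60 };
    mlib_d64 z;

    /* Expected result: 15 + 35 + 5 + 85 + 110 = 250 */
    if (__mlib_VectorSumAbsDiff_S8_Sat(&z, x, y, 5) == MLIB_SUCCESS)
        printf("sum of |x[i] - y[i]| = %.0f\n", z);
    return 0;
}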
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
	const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	unsigned partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		const unsigned threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
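			/* With this check, bps + FLAC__MAX_EXTRA_RESIDUAL_BPS + ilog2(default_partition_samples) < 32,
			 * so even default_partition_samples residuals of maximal magnitude sum to less than 2^32 and
			 * each partition total fits the 32-bit accumulation used below. */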
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				unsigned e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
				if(e1 > end)
					e1 = end; /* try flac -l 1 -b 16 and you'll be here */

				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=4) {
					__m128i mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample));
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask);
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask);
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8));
				mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4));
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				unsigned e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
				FLAC__ASSERT(e1 <= end);

				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); /*  0   0   0   r0 */
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask); /*  0   0   0  |r0|  ==   00   |r0_64| */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=2) {
					__m128i mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /*  0   0   r1  r0 */
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask); /*  0   0  |r1|   |r0| */
					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
					__m128i mm_mask = _mm_srai_epi32(mm_res, 31);
					mm_res = _mm_xor_si128(mm_res, mm_mask);
					mm_res = _mm_sub_epi32(mm_res, mm_mask);
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		unsigned from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			unsigned i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition  ] +
					abs_residual_partition_sums[from_partition+1];
				from_partition += 2;
			}
		}
	}
}
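The merge loop appends the sums for each lower order after those already stored for higher orders. A worked example, assuming max_partition_order = 2 and min_partition_order = 0: indices 0..3 of abs_residual_partition_sums hold the four order-2 sums, the first pass writes the order-1 sums at indices 4 and 5 ([0]+[1] and [2]+[3]), and the second pass writes the single order-0 sum at index 6 ([4]+[5]).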