Example #1
void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb)
{
  __m128i zero     = _mm_set1_epi16(0);
  __m128i lsb_mask = _mm_set1_epi16(1);
  
  __m128i *appPtr = (__m128i*) h->app1;
  __m128i *outPtr = (__m128i*) output;
  __m128i ap, out, out0, out1; 
    
  for (uint32_t i = 0; i < long_cb/16; i++) {
    ap   = _mm_load_si128(appPtr); appPtr++;    
    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
    ap   = _mm_load_si128(appPtr); appPtr++;
    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
    
    out  = _mm_packs_epi16(out0, out1);
    _mm_store_si128(outPtr, out);
    outPtr++;
  }
  if (long_cb%16) {
    for (int i=0;i<8;i++) {
      output[long_cb-8+i] = h->app1[long_cb-8+i]>0?1:0;
    }
  }
}
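For reference, here is a minimal scalar sketch (helper name hypothetical) of what the vectorized loop above computes: each 16-bit soft value is mapped to a hard bit, 1 if positive, 0 otherwise. Because the per-lane results are only 0 or 1, the signed saturation of _mm_packs_epi16 never clamps in this use.

#include <stdint.h>

/* Scalar reference (assumed equivalent) for the SIMD hard-decision loop above. */
static void tdec_decision_ref(const int16_t *app1, uint8_t *output, uint32_t long_cb)
{
  for (uint32_t i = 0; i < long_cb; i++) {
    output[i] = (app1[i] > 0) ? 1 : 0;
  }
}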
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
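A rough scalar equivalent of the sign-mask construction above (my reading of the code): the two saturating pack steps always preserve each value's sign, whatever its magnitude, so the final _mm_movemask_epi8 collects one coverage bit per pixel of the 4x4 block, row-major.

/* Scalar sketch (assumed equivalent): bit (j*4 + i) of the mask is set when the
 * edge-function value for pixel (i, j) of the 4x4 block is negative. */
static unsigned build_mask_linear_ref(int c, int dcdx, int dcdy)
{
   unsigned mask = 0;
   for (int j = 0; j < 4; j++) {        /* rows: step by dcdy */
      for (int i = 0; i < 4; i++) {     /* columns: step by dcdx */
         int v = c + i * dcdx + j * dcdy;
         mask |= (unsigned)(v < 0) << (j * 4 + i);
      }
   }
   return mask;
}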
Example #3
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }

  {   // precompute clamped levels and contexts, packed to 8b.
    const __m128i zero = _mm_setzero_si128();
    const __m128i kCst2 = _mm_set1_epi8(2);
    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
    const __m128i D0 = _mm_sub_epi16(zero, c0);
    const __m128i D1 = _mm_sub_epi16(zero, c1);
    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
    const __m128i E1 = _mm_max_epi16(c1, D1);
    const __m128i F = _mm_packs_epi16(E0, E1);
    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]

    _mm_storeu_si128((__m128i*)&ctxs[0], G);
    _mm_storeu_si128((__m128i*)&levels[0], H);

    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
  }
  for (; n < res->last; ++n) {
    const int ctx = ctxs[n];
    const int level = levels[n];
    const int flevel = abs_levels[n];   // full level
    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
    const int level = levels[n];
    const int flevel = abs_levels[n];
    assert(flevel != 0);
    cost += VP8LevelFixedCosts[flevel] + t[level];
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = ctxs[n];
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
}
Example #4
__m128i test_mm_packs_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_packs_epi16
  // DAG: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  //
  // ASM-LABEL: test_mm_packs_epi16
  // ASM: packsswb
  return _mm_packs_epi16(A, B);
}
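For orientation, a scalar model of the intrinsic under test (my sketch, not the compiler's lowering): each of the sixteen 16-bit inputs is clamped to [-128, 127] and stored as a byte, the first operand filling bytes 0-7 and the second bytes 8-15.

#include <stdint.h>

/* Scalar reference for _mm_packs_epi16 (assumed model). */
static void packs_epi16_ref(const int16_t a[8], const int16_t b[8], int8_t out[16])
{
  for (int i = 0; i < 8; i++) {
    int va = a[i], vb = b[i];
    out[i]     = (int8_t)(va > 127 ? 127 : va < -128 ? -128 : va);
    out[i + 8] = (int8_t)(vb > 127 ? 127 : vb < -128 ? -128 : vb);
  }
}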
mlib_status
__mlib_VectorConvert_S8_S32_Sat(
	mlib_s8 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3, xval;
	mlib_s32 *px = (mlib_s32 *)x;
	mlib_s8 *pz = (mlib_s8 *)z;
	__m128i zbuf, buf1, buf2, buf3, buf4, mask;

	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;

	nstep = 16 / sizeof (mlib_s8);
	n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			xval = *px++;
			SAT_S8(xval);
			*pz++ = xval;
		}
	} else {
		for (i = 0; i < n1; i++) {
			xval = *px++;
			SAT_S8(xval);
			*pz++ = xval;
		}

		for (i = 0; i < n2; i++) {
			buf1 = _mm_load_si128((__m128i *)px);
			buf2 = _mm_load_si128((__m128i *)px + 1);
			buf3 = _mm_load_si128((__m128i *)px + 2);
			buf4 = _mm_load_si128((__m128i *)px + 3);
			buf1 = _mm_packs_epi32(buf1, buf2);
			buf3 = _mm_packs_epi32(buf3, buf4);
			zbuf = _mm_packs_epi16(buf1, buf3);
			_mm_storeu_si128((__m128i *)pz, zbuf);
			px += nstep;
			pz += nstep;
		}

		for (i = 0; i < n3; i++) {
			xval = *px++;
			SAT_S8(xval);
			*pz++ = xval;
		}
	}

	return (MLIB_SUCCESS);
}
Example #6
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i signs = _mm_cmpgt_epi8(zero, *x);
    const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
    const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
    const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
    const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
    *x = _mm_packs_epi16(lo_1, hi_1);
}
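A scalar sketch of the same operation (assuming the usual arithmetic right shift on signed values): each byte is sign-extended to 16 bits, shifted, and re-narrowed; since a value shifted right by 3 always fits in 8 bits, the saturating pack never clamps here.

#include <stdint.h>

/* Scalar reference (assumed equivalent) for SignedShift8b. */
static void SignedShift8b_ref(int8_t x[16])
{
    for (int i = 0; i < 16; i++) {
        x[i] = (int8_t)(x[i] >> 3);   /* arithmetic shift keeps the sign */
    }
}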
static INLINE void
build_masks(int c, 
	    int cdiff,
	    int dcdx,
	    int dcdy,
	    unsigned *outmask,
	    unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
Example #8
// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
                                      const __m128i* const a0_lo,
                                      const __m128i* const a0_hi) {
    const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
    const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
    const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    *pi = _mm_adds_epi8(*pi, delta);
    *qi = _mm_subs_epi8(*qi, delta);
    FLIP_SIGN_BIT2(*pi, *qi);
}
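As a rough scalar model of the update above (per pixel, helper name hypothetical): the 16-bit value a is shifted down to the delta, clamped to [-128, 127] by the saturating pack, applied with saturating byte arithmetic, and the final XOR with 0x80 performs the sign flip back to the unsigned range.

#include <stdint.h>

/* Scalar sketch (assumed model, not the library API) of one pixel update. */
static void Update1Pixel_ref(int8_t *p, int8_t *q, int16_t a)
{
    int d = a >> 7;                                       /* delta */
    if (d > 127) d = 127; else if (d < -128) d = -128;    /* packs_epi16 saturation */
    int pv = *p + d, qv = *q - d;                         /* adds/subs_epi8 also saturate */
    if (pv > 127) pv = 127; else if (pv < -128) pv = -128;
    if (qv > 127) qv = 127; else if (qv < -128) qv = -128;
    *p = (int8_t)(pv ^ 0x80);                             /* FLIP_SIGN_BIT: s8 -> u8 */
    *q = (int8_t)(qv ^ 0x80);
}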
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) and replicate the same
  // data in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // add the rounding constant and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}
Example #10
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_12, symbol_34;
  __m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
  __m128i result1n, result1a, result2n, result2a;
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  __m128i shuffle_abs_1     = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);

  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  __m128i shuffle_abs_2     = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);

  for (int i=0;i<nsymbols/8;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34  = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i   = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset);

    result1n = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result1a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);

    result2n = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result2a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);

    _mm_store_si128(resultPtr, _mm_or_si128(result1n, result1a)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result2n, result2a)); resultPtr++;

  }
  // Demodulate last symbols
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
    short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
Example #11
__m64 _m_packsswb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_packs_epi16(lhs, rhs);

    _MM1.m64_i32[0] = lhs.m128i_i32[0];
    _MM1.m64_i32[1] = lhs.m128i_i32[2];
    return _MM1;
}
Example #12
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
                                  VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
  const __m128i zero = _mm_setzero_si128();
  const __m128i m0 = _mm_packs_epi16(c0, c1);
  const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
  // Get the comparison results as a bitmask into 16bits. Negate the mask to get
  // the position of entries that are not equal to zero. We don't need to mask
  // out least significant bits according to res->first, since coeffs[0] is 0
  // if res->first > 0.
  const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
  // The position of the most significant non-zero bit indicates the position of
  // the last non-zero value.
  assert(res->first == 0 || coeffs[0] == 0);
  res->last = mask ? BitsLog2Floor(mask) : -1;
  res->coeffs = coeffs;
}
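A scalar sketch of what the mask computation above amounts to (helper name hypothetical): pack the 16 coefficients to bytes, note which are non-zero, and take the highest such index. Narrowing with _mm_packs_epi16 is safe for this purpose because signed saturation maps every non-zero 16-bit value to a non-zero byte.

#include <stdint.h>

/* Scalar model (assumed) of the last-nonzero search done with the SSE2 mask. */
static int FindLastNonZero_ref(const int16_t coeffs[16])
{
  int last = -1;
  for (int i = 0; i < 16; i++) {
    if (coeffs[i] != 0) last = i;   /* highest index wins, like BitsLog2Floor(mask) */
  }
  return last;
}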
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) and replicate the same
  // data in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // add the rounding constant and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{

   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}
static void GF_FUNC_ALIGN VS_CC
float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height,
                  int src_stride, int dst_stride, float th, int bits)
{
    __m128 tmax = _mm_set1_ps(th);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);
            __m128 xmf2 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 8), tmax);
            __m128 xmf3 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 12), tmax);
            __m128i xmi0 = _mm_packs_epi32(_mm_castps_si128(xmf0),
                                           _mm_castps_si128(xmf1));
            __m128i xmi1 = _mm_packs_epi32(_mm_castps_si128(xmf2),
                                           _mm_castps_si128(xmf3));
            xmi0 = _mm_packs_epi16(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
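A scalar sketch of the thresholding above (helper name hypothetical): _mm_cmpge_ps yields all-ones or all-zero 32-bit lanes, and the two saturating packs narrow those to 0xFF or 0x00 bytes, since an all-ones lane reads as -1 at every width and saturates to -1 (0xFF) again.

#include <stdint.h>

/* Scalar model (assumed) of one row of float_to_dst_8bit. */
static void float_to_u8_threshold_ref(const float *srcp, uint8_t *dstp, int width, float th)
{
    for (int x = 0; x < width; x++) {
        dstp[x] = (srcp[x] >= th) ? 0xFF : 0x00;
    }
}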
/**
 * SSE Implementation of \c cnsFormula (subroutine of cnsResponse).
 * \c scale, \c gaussI2 and \c regVar are 32bit floats (gaussI2 as A and B).
 * \c sobelX, \c sobelY, \c gaussI are signed short.
 * \c result is a packed vector of unsigned 8bit numbers with the x and y components
 * alternating and \c offset (unsigned char) added.
 */
ALWAYSINLINE static void cnsFormula(__m128i& result, __m128i sobelX, __m128i sobelY, __m128i& gaussI,
                                    const __m128& gaussI2A, const __m128& gaussI2B,
                                    const __m128& scale, const __m128& regVar, __m128i offset)
{
  __m128 gaussIA = _mm_cvtepi32_ps(_mm_unpacklo_epi16(gaussI, _mm_setzero_si128()));
  __m128 gaussIB = _mm_cvtepi32_ps(_mm_unpackhi_epi16(gaussI, _mm_setzero_si128()));

  __m128 factorA = _mm_add_ps(_mm_sub_ps(gaussI2A, _mm_mul_ps(gaussIA, gaussIA)), regVar); // gaussI2-gaussI^2+regVar
  __m128 factorB = _mm_add_ps(_mm_sub_ps(gaussI2B, _mm_mul_ps(gaussIB, gaussIB)), regVar);

  factorA = _mm_mul_ps(_mm_rsqrt_ps(factorA), scale); // scale/sqrt(gaussI2-gaussI^2+regVar)
  factorB = _mm_mul_ps(_mm_rsqrt_ps(factorB), scale);

  // (2^-11)*sobelX*(scale/sqrt(gaussI2-gaussI^2+regVar))
  __m128i factor = _mm_packs_epi32(_mm_cvtps_epi32(factorA), _mm_cvtps_epi32(factorB));
  __m128i resultXepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelX, 5), factor);
  __m128i resultYepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelY, 5), factor);

  // Convert to 8bit and interleave X and Y
  // the second argument of packs duplicates the values into the upper bytes, but those are ignored later; unpacklo interleaves X and Y
  __m128i resultepi8 = _mm_unpacklo_epi8(_mm_packs_epi16(resultXepi16, resultXepi16), _mm_packs_epi16(resultYepi16, resultYepi16));

  result = _mm_add_epi8(resultepi8, offset); // add offset, switching to epu8
}
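A scalar sketch of the per-pixel arithmetic (my reading of the code; the rsqrt approximation and exact rounding of the SSE version are ignored): factor = scale / sqrt(gaussI2 - gaussI^2 + regVar), then each Sobel component is scaled by roughly 2^-11 * factor via the shift-by-5 plus mulhi, saturated to 8 bits, and offset.

#include <math.h>
#include <stdint.h>

/* Scalar sketch (assumed model) of cnsFormula for a single pixel. */
static void cnsFormula_ref(int16_t sobelX, int16_t sobelY, int16_t gaussI,
                           float gaussI2, float scale, float regVar,
                           uint8_t offset, uint8_t* outX, uint8_t* outY)
{
  float f = scale / sqrtf(gaussI2 - (float)gaussI * gaussI + regVar);
  int fi = (int)f;
  if (fi > 32767) fi = 32767; else if (fi < -32768) fi = -32768; // packs_epi32 clamp
  int16_t sx = (int16_t)(sobelX * 32);   // _mm_slli_epi16(sobelX, 5), wraps in 16 bits
  int16_t sy = (int16_t)(sobelY * 32);
  int rx = (sx * fi) >> 16;              // _mm_mulhi_epi16: upper 16 bits of the product
  int ry = (sy * fi) >> 16;
  if (rx > 127) rx = 127; else if (rx < -128) rx = -128;   // packs_epi16 saturation
  if (ry > 127) ry = 127; else if (ry < -128) ry = -128;
  *outX = (uint8_t)(rx + offset);        // add offset, switching to unsigned
  *outY = (uint8_t)(ry + offset);
}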
Example #17
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
{
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34;
  __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);

  for (int i=0;i<nsymbols/8;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34  = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i   = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);

    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;

  }
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));

    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
Example #18
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
                                       const uint16_t* const sharpen,
                                       const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
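A scalar sketch of the per-coefficient arithmetic above (helper name hypothetical): take the absolute value, optionally sharpen, quantize with the fixed-point reciprocal and bias, clamp to the 2047 maximum, and restore the sign; the reconstructed value written back to in[] is then out * q.

#include <stdint.h>

/* Scalar model (assumed) of one coefficient of DoQuantizeBlock; QFIX = 17. */
static int quantize_coeff_ref(int in, int iq, int bias, int sharpen)
{
  const int sign = (in < 0);
  int coeff = (sign ? -in : in) + sharpen;               /* abs(in) + sharpen */
  int out = (int)(((int64_t)coeff * iq + bias) >> 17);   /* (coeff * iQ + B) >> QFIX */
  if (out > 2047) out = 2047;                            /* MAX_LEVEL clamp */
  return sign ? -out : out;
}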
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest)
{
	if(dest.empty())dest.create(src1.size(),CV_8U);

	const int imsize = (src1.size().area()/16);
	uchar* s1 = src1.data;
	uchar* s2 = src2.data;
	uchar* a = alpha.data;
	uchar* d = dest.data;

	const __m128i zero = _mm_setzero_si128();
	const __m128i amax = _mm_set1_epi8(char(255));
	int i=0;
	if(s1==d)
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_stream_si128((__m128i*)s1,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
		}
	}
	else
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_store_si128((__m128i*)d,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
			d+=16;
		}
	}

	{
		uchar* s1 = src1.data;
		uchar* s2 = src2.data;
		uchar* a = alpha.data;
		uchar* d = dest.data;
		for(int n=i*16;n<src1.size().area();n++)
		{
			d[n] = (a[n]*s1[n] + (255-a[n])*s2[n])>>8;
		}
	}
}
Example #20
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) and replicate the same
  // data in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i-=2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line-3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line+5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));


    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16 bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink each 16 bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr+=dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));


    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift each 16 bit value right by 7 bits
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink each 16 bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  }
}
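For orientation, a scalar sketch of the 8-tap horizontal filter this AVX2 code vectorizes (helper name hypothetical; the min/max pairing of the middle taps, which controls intermediate saturation order, is omitted): each output pixel is a rounded, right-shifted, clamped dot product of 8 source pixels with the filter.

#include <stdint.h>

/* Scalar model (assumed) of one row of the 8-tap horizontal convolution. */
static void filter_row8_ref(const uint8_t *src, uint8_t *dst, int width,
                            const int16_t filter[8]) {
  for (int x = 0; x < width; x++) {
    int sum = 64;                               // rounding offset (1 << 6)
    for (int k = 0; k < 8; k++) {
      sum += src[x + k - 3] * filter[k];        // taps centered like src_ptr - 3
    }
    sum >>= 7;                                  // same shift as _mm256_srai_epi16(..., 7)
    if (sum < 0) sum = 0; else if (sum > 255) sum = 255;  // packus saturation
    dst[x] = (uint8_t)sum;
  }
}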
Example #22
/*
============
idSIMD_SSE::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;
	
	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;
		post = count - (cnt<<2);

	/*
		__asm	mov			edx, cnt													
		__asm	test		edx, edx													
		__asm	je			doneCmp														
	*/
		cnt_l = cnt;
		if(cnt_l != 0) {
	/*
		__asm	push		ebx															
		__asm	neg			edx															
		__asm	mov			esi, src0													
		__asm	prefetchnta	[esi+64]													
		__asm	movss		xmm1, constant												
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )						
		__asm	mov			edi, dst													
		__asm	mov			cl, bitNum	
	*/
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch(src0_p+64, _MM_HINT_NTA);
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss((float *)constant_p);
			xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
			dst_p = (char *)dst;
	/*
			__asm loopNA:																	
	*/
			do {
	/*
		__asm	movups		xmm0, [esi]													
		__asm	prefetchnta	[esi+128]													
		__asm	cmpltps		xmm0, xmm1													
		__asm	movmskps	eax, xmm0
		__asm	mov			ah, al														
		__asm	shr			ah, 1														
		__asm	mov			bx, ax														
		__asm	shl			ebx, 14														
		__asm	mov			bx, ax														
		__asm	and			ebx, 0x01010101												
		__asm	shl			ebx, cl														
		__asm	or			ebx, dword ptr [edi]										
		__asm	mov			dword ptr [edi], ebx										
		__asm	add			esi, 16														
		__asm	add			edi, 4														
		__asm	inc			edx															
		__asm	jl			loopNA														
		__asm	pop			ebx		
	*/
				xmm0 = _mm_loadu_ps((float *) src0_p);
				_mm_prefetch(src0_p+128, _MM_HINT_NTA);
				xmm0 = _mm_cmplt_ps(xmm0, xmm1);
				// Simplify using SSE2
				xmm0i = (__m128i) xmm0;
				xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
				xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
				mask_l = _mm_cvtsi128_si32(xmm0i);
				// End 
				mask_l = mask_l &  0x01010101;
				mask_l = mask_l << bitNum;
				dst_l  = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while (cnt_l < 0);
		}													
	}																					
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) and replicate the
  // same data in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // keep each pair of consecutive loads in the same 256 bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);


  for (i = output_height; i > 1; i-=2) {
     // load the last 2 loads of 16 bytes and have every two
     // consecutive loads in the same 256 bit register
     srcReg32b8 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
     _mm256_castsi256_si128(srcReg32b8), 1);
     srcReg32b9 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
     _mm256_castsi256_si128(srcReg32b9), 1);

     // merge every two consecutive registers
     // save
     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

     // add and saturate the results together
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));

     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

     // shift each 16 bit value right by 7 bits
     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

     // shrink each 16 bit value to 8 bits; the first lane contains the first
     // convolve result and the second lane contains the second convolve
     // result
     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

     src_ptr+=src_stride;

     // save 16 bytes
     _mm_store_si128((__m128i*)output_ptr,
     _mm256_castsi256_si128(srcReg32b1));

     // save the next 16 bytes
     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
     _mm256_extractf128_si256(srcReg32b1, 1));

     output_ptr+=dst_stride;

     // save part of the registers for next strides
     srcReg32b10 = srcReg32b11;
     srcReg32b1 = srcReg32b3;
     srcReg32b11 = srcReg32b2;
     srcReg32b3 = srcReg32b5;
     srcReg32b2 = srcReg32b4;
     srcReg32b5 = srcReg32b7;
     srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);


    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));


    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink each 16-bit value to 8 bits; the low half holds the first
    // convolve result and the high half holds the second
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
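
The min/max ordering used above when accumulating the four partial sums is deliberate: the sums are combined with saturating 16-bit adds, and folding in the smaller of the two middle terms before the larger keeps the running total closer to zero, so saturation only triggers when the true sum itself would overflow. A minimal standalone SSE2 sketch of the same idea (illustrative only; inputs are made up):

#include <emmintrin.h>
#include <stdio.h>

/* Sum four signed 16-bit partial products a+b+c+d with saturating adds.
 * Adding min(b, c) before max(b, c) mirrors the ordering of the kernels
 * above: the result is identical when nothing saturates, but the
 * intermediate totals stay smaller. */
static __m128i sum4_sat(__m128i a, __m128i b, __m128i c, __m128i d) {
  __m128i acc = _mm_adds_epi16(a, d);
  acc = _mm_adds_epi16(acc, _mm_min_epi16(b, c));
  acc = _mm_adds_epi16(acc, _mm_max_epi16(b, c));
  return acc;
}

int main(void) {
  __m128i a = _mm_set1_epi16(1000), b = _mm_set1_epi16(-20000);
  __m128i c = _mm_set1_epi16(15000), d = _mm_set1_epi16(500);
  short out[8];
  _mm_storeu_si128((__m128i *)out, sum4_sat(a, b, c, d));
  printf("%d\n", out[0]);  /* 1000 - 20000 + 15000 + 500 = -3500 */
  return 0;
}
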
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;
   
   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
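
The c = c - 1 and rej4 = rej4 + 1 adjustments above turn the rasterizer's c <= 0 coverage test into a pure sign-bit test, which is exactly what _mm_srai_epi32(..., 31) and the later packs plus _mm_movemask_epi8 read out. A tiny standalone check of the underlying integer identity (illustrative only):

#include <assert.h>

int main(void) {
  /* For any integer c that does not underflow, (c <= 0) == ((c - 1) < 0):
   * subtracting 1 moves the boundary so only the sign bit has to be read. */
  long c;
  for (c = -100000; c <= 100000; ++c)
    assert((c <= 0) == ((c - 1) < 0));
  return 0;
}
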
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
      
      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
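
SCALAR_EPI32, used throughout both rasterizer variants above, is not defined in this excerpt. A plausible equivalent, shown purely as an assumption about what the helper does, broadcasts one 32-bit lane of a register to all four lanes:

#include <emmintrin.h>

/* Assumed stand-in for the SCALAR_EPI32 helper used above (not defined in
 * this excerpt): broadcast 32-bit lane i of m across the whole register.
 * i must be a compile-time constant. */
#define SCALAR_EPI32(m, i) \
  _mm_shuffle_epi32((m), _MM_SHUFFLE((i), (i), (i), (i)))

With such a definition, SCALAR_EPI32(c, 0) replicates the plane-0 starting value four times so that span_0 can be added to it in a single instruction.
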
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // build a register with 64 in each 16-bit lane (rounding constant for >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and replicate the
  // same data into both halves of the 128-bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes
    // (part of them was already covered by the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink each 16-bit value to 8 bits; the low half holds the first
    // convolve result and the high half holds the second
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr+=src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);

    output_ptr+=output_pitch;
  }
}
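
The filt1_global .. filt4_global shuffle tables loaded at the top of the horizontal filters are not shown in this excerpt. Plausible contents (an assumption, not taken from the excerpt) pair consecutive source bytes starting at offsets 0, 2, 4 and 6, so that _mm_maddubs_epi16 multiplies each pixel pair by the matching tap pair:

/* Assumed shuffle tables for the horizontal 8-tap filters: each table
 * interleaves consecutive source pixels (offset, offset+1), (offset+1,
 * offset+2), ... so maddubs can apply one tap pair per output position.
 * libvpx/libaom align these with their DECLARE_ALIGNED macro; a plain
 * GCC/Clang attribute is used here instead. */
static const unsigned char filt1_global[16] __attribute__((aligned(16))) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
static const unsigned char filt2_global[16] __attribute__((aligned(16))) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
static const unsigned char filt3_global[16] __attribute__((aligned(16))) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
static const unsigned char filt4_global[16] __attribute__((aligned(16))) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
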
void aom_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // build a register with 64 in each 16-bit lane (rounding constant for >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and replicate the
  // same data into both halves of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
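
The addFilterReg64 add followed by _mm_srai_epi16(..., 7) is rounded Q7 fixed-point scaling: the eight filter taps sum to 128, so adding 64 before the arithmetic shift rounds the convolution result to the nearest integer. A scalar sketch of just that rounding step (illustrative only):

#include <stdio.h>

/* Scalar equivalent of the SIMD (x + 64) >> 7 rounding used above. */
static int round_q7(int x) { return (x + 64) >> 7; }

int main(void) {
  /* 63 -> 0, 64 -> 1, 192 -> 2: values at or above the halfway point round up */
  printf("%d %d %d\n", round_q7(63), round_q7(64), round_q7(192));
  return 0;
}
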
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // build a register with 64 in each 16-bit lane (rounding constant for >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and replicate the
  // same data into both halves of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
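
The srcReg1 .. srcReg8 rotation at the bottom of the vertical loop is a sliding window: all eight input rows stay live in registers and only one new row is loaded per output row. A scalar sketch of the same pattern, using a hypothetical 8-tap filter whose taps sum to 128 (illustrative only, not an actual libaom filter):

#include <stdio.h>

#define TAPS 8

int main(void) {
  /* hypothetical Q7 tap set (sums to 128) and a toy column of source pixels */
  const int taps[TAPS] = { -1, 3, -10, 72, 72, -10, 3, -1 };
  unsigned char col[16];
  int win[TAPS], r, k;

  for (r = 0; r < 16; ++r) col[r] = (unsigned char)(r * 8);
  for (k = 0; k < TAPS; ++k) win[k] = col[k];            /* prime the window */

  for (r = 0; r + TAPS <= 16; ++r) {
    int sum = 0;
    for (k = 0; k < TAPS; ++k) sum += win[k] * taps[k];
    printf("%d\n", (sum + 64) >> 7);                     /* rounded Q7 result */
    for (k = 0; k < TAPS - 1; ++k) win[k] = win[k + 1];  /* slide down a row */
    if (r + TAPS < 16) win[TAPS - 1] = col[r + TAPS];    /* load one new row */
  }
  return 0;
}
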
void aom_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // build a register with 64 in each 16-bit lane (rounding constant for >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and replicate the
  // same data into both halves of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local filters
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the upper half of each register
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
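
All of these kernels lean on _mm_maddubs_epi16, which multiplies unsigned bytes from its first operand by signed bytes from its second and sums adjacent products into saturated 16-bit lanes. A minimal standalone example (illustrative values):

#include <tmmintrin.h>  /* SSSE3 */
#include <stdio.h>

int main(void) {
  /* lane 0 = 10*(-1) + 20*3 = 50, lane 1 = 30*(-1) + 40*3 = 90 */
  __m128i pix  = _mm_setr_epi8(10, 20, 30, 40, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, 0);
  __m128i taps = _mm_setr_epi8(-1, 3, -1, 3, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, 0);
  short out[8];
  _mm_storeu_si128((__m128i *)out, _mm_maddubs_epi16(pix, taps));
  printf("%d %d\n", out[0], out[1]);
  return 0;
}
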
Exemple #30
0
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
                                       const uint16_t* const sharpen,
                                       const VP8Matrix* const mtx) {
    const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
    const __m128i zero = _mm_setzero_si128();
    __m128i out0, out8;
    __m128i packed_out;

    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so that
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
    __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
    const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
    const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
    const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
    const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

    // coeff = abs(in)
    __m128i coeff0 = _mm_abs_epi16(in0);
    __m128i coeff8 = _mm_abs_epi16(in8);

    // coeff = abs(in) + sharpen
    if (sharpen != NULL) {
        const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
        const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
        coeff0 = _mm_add_epi16(coeff0, sharpen0);
        coeff8 = _mm_add_epi16(coeff8, sharpen8);
    }

    // out = (coeff * iQ + B) >> QFIX
    {
        // doing calculations with 32b precision (QFIX=17)
        // out = (coeff * iQ)
        const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
        const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
        const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
        const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
        __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
        __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
        // out = (coeff * iQ + B)
        const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
        const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
        const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
        const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
        out_00 = _mm_add_epi32(out_00, bias_00);
        out_04 = _mm_add_epi32(out_04, bias_04);
        out_08 = _mm_add_epi32(out_08, bias_08);
        out_12 = _mm_add_epi32(out_12, bias_12);
        // out = QUANTDIV(coeff, iQ, B, QFIX)
        out_00 = _mm_srai_epi32(out_00, QFIX);
        out_04 = _mm_srai_epi32(out_04, QFIX);
        out_08 = _mm_srai_epi32(out_08, QFIX);
        out_12 = _mm_srai_epi32(out_12, QFIX);

        // pack result as 16b
        out0 = _mm_packs_epi32(out_00, out_04);
        out8 = _mm_packs_epi32(out_08, out_12);

        // if (coeff > 2047) coeff = 2047
        out0 = _mm_min_epi16(out0, max_coeff_2047);
        out8 = _mm_min_epi16(out8, max_coeff_2047);
    }

    // put sign back
    out0 = _mm_sign_epi16(out0, in0);
    out8 = _mm_sign_epi16(out8, in8);

    // in = out * Q
    in0 = _mm_mullo_epi16(out0, q0);
    in8 = _mm_mullo_epi16(out8, q8);

    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);

    // zigzag the output before storing it. The re-ordering is:
    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
    // There's only two misplaced entries ([8] and [7]) that are crossing the
    // reg's boundaries.
    // We use pshufb instead of pshuflo/pshufhi.
    {
        const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
        const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
        const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
        const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
        const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
        const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
        const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
        const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
        const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
        const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
        _mm_storeu_si128((__m128i*)&out[0], out_z0);
        _mm_storeu_si128((__m128i*)&out[8], out_z8);
        packed_out = _mm_packs_epi16(out_z0, out_z8);
    }

    // detect if all 'out' values are zeroes or not
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
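
PSHUFB_CST, used above to build the zigzag shuffle masks, is not defined in this excerpt. An assumed equivalent turns eight 16-bit element indices into a pshufb byte mask; an index of -1 yields bytes with the high bit set, which _mm_shuffle_epi8 maps to zero, leaving those lanes free for the two cross-register entries that are OR-ed in afterwards:

#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 consumes masks like these */

/* Assumed stand-in for the PSHUFB_CST helper used above: each 16-bit index
 * A..H selects one int16_t element (its two bytes) of the source register;
 * -1 produces bytes with the high bit set, which pshufb zeroes. */
#define PSHUFB_CST(A, B, C, D, E, F, G, H)                           \
  _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0,   \
               2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0,   \
               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0,   \
               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
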