static void GF_FUNC_ALIGN VS_CC
float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height,
                  int src_stride, int dst_stride, float th, int bits)
{
    __m128 tmax = _mm_set1_ps(th);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);
            __m128 xmf2 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 8), tmax);
            __m128 xmf3 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 12), tmax);
            __m128i xmi0 = _mm_packs_epi32(_mm_castps_si128(xmf0),
                                           _mm_castps_si128(xmf1));
            __m128i xmi1 = _mm_packs_epi32(_mm_castps_si128(xmf2),
                                           _mm_castps_si128(xmf3));
            xmi0 = _mm_packs_epi16(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
Beispiel #2
0
inline FORCE_INLINE __m128i export_i30_u16(__m128i lo, __m128i hi, uint16_t limit)
{
	const __m128i round = _mm_set1_epi32(1 << 13);

	lo = _mm_add_epi32(lo, round);
	hi = _mm_add_epi32(hi, round);

	lo = _mm_srai_epi32(lo, 14);
	hi = _mm_srai_epi32(hi, 14);

	lo = _mm_packs_epi32(lo, hi);

	return lo;
}
Beispiel #3
0
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 2) {
    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
    _mm_storel_epi64((__m128i*)dst, tmp3);
    dst += 4 * 2;
  }
}
Beispiel #4
0
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
                                            int tile_width, int tile_height,
                                            int green_to_blue, int red_to_blue,
                                            int histo[]) {
  const __m128i mults_r = _mm_set_epi16(
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
      const __m128i A1 = _mm_slli_epi16(in1, 8);
      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i B1 = _mm_and_si128(in1, mask_g);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
      const __m128i E1 = _mm_sub_epi8(in1, D1);
      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
      const __m128i F1 = _mm_srli_epi32(C1, 16);
      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
      const __m128i G1 = _mm_sub_epi8(E1, F1);
      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
      const __m128i H1 = _mm_and_si128(G1, mask_b);
      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height,
                                       green_to_blue, red_to_blue, histo);
    }
  }
}
Beispiel #5
0
/* -----------------------------------
 *      weighted_merge_luma_yuy2
 * -----------------------------------
 */
static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma+x)); //v1 y3 u1 y2 v0 y1 u0 y0

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2); 

      src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si128(src_hi, luma_mask); 

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_luma = _mm_packs_epi32(src_lo, src_hi);

      __m128i result_chroma = _mm_and_si128(px1, chroma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
}
Beispiel #6
0
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	for (size_t i = 0; i < size; i++)
		out[i] = clamp_s16(in[i]);
}
Beispiel #7
0
/* -----------------------------------
 *     weighted_merge_chroma_yuy2
 * -----------------------------------
 */
static void weighted_merge_chroma_yuy2_sse2(BYTE *src, const BYTE *chroma, int pitch, int chroma_pitch,int width, int height, int weight, int invweight )
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); 
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(chroma+x)); 

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); 
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2); 

      src_lo = _mm_srli_epi16(src_lo, 8);
      src_hi = _mm_srli_epi16(src_hi, 8);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_chroma = _mm_packs_epi32(src_lo, src_hi);
      result_chroma = _mm_slli_epi16(result_chroma, 8);

      __m128i result_luma = _mm_and_si128(px1, luma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x+1] = (chroma[x+1] * weight + src[x+1] * invweight + 16384) >> 15;
    }

    src += pitch;
    chroma += chroma_pitch;
  }
}
static void recon8x8(int **m7, imgpel **mb_rec, imgpel **mpr, int max_imgpel_value, int ioff)
{
  int j;
  int    *m_tr  = NULL;
  imgpel *m_rec = NULL;
  imgpel *m_prd = NULL;
  __m128i mm_dq2 = _mm_set1_epi16((1<<(DQ_BITS_8-1)));
  __m128i mm0  = _mm_set1_epi16(0);
  __m128i mm7, mm72, mmPred, tmp;

  for( j = 0; j < 8; j++)
  {

	m_tr = (*m7++) + ioff;
    m_rec = (*mb_rec++) + ioff;
    m_prd = (*mpr++) + ioff;

	mm7 = _mm_loadu_si128((__m128i*) m_tr);
	mm72 = _mm_loadu_si128((__m128i*) (m_tr+4));
	mm7 = _mm_packs_epi32(mm7, mm72);

	mmPred = _mm_loadu_si128((__m128i*) m_prd);
	mmPred = _mm_unpacklo_epi8(mmPred, mm0);

	tmp = _mm_add_epi16(mm7, mm_dq2);
	tmp = _mm_srai_epi16(tmp, DQ_BITS_8);
	tmp = _mm_add_epi16(tmp, mmPred);

	tmp = _mm_packus_epi16(tmp, tmp);
	_mm_storel_epi64((__m128i*) m_rec, tmp);



	/*
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec++ = (imgpel) iClip1(max_imgpel_value, (*m_prd++) + rshift_rnd_sf(*m_tr++, DQ_BITS_8)); 
    *m_rec   = (imgpel) iClip1(max_imgpel_value, (*m_prd  ) + rshift_rnd_sf(*m_tr  , DQ_BITS_8)); 
	*/
  }
}
Beispiel #9
0
void audio_convert_float_to_s16_SSE2(int16_t *out,
      const float *in, size_t samples)
{
   __m128 factor = _mm_set1_ps((float)0x7fff);
   size_t i;
   for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
   {
      __m128 input[2] = { _mm_loadu_ps(in + 0), _mm_loadu_ps(in + 4) };
      __m128 res[2] = { _mm_mul_ps(input[0], factor), _mm_mul_ps(input[1], factor) };

      __m128i ints[2] = { _mm_cvtps_epi32(res[0]), _mm_cvtps_epi32(res[1]) };
      __m128i packed = _mm_packs_epi32(ints[0], ints[1]);

      _mm_storeu_si128((__m128i *)out, packed);
   }

   audio_convert_float_to_s16_C(out, in, samples - i);
}
Beispiel #10
0
//	const __m128i	mask_lsb = _mm_set1_epi16 (0x00FF);
//	const __m128i	sign_bit = _mm_set1_epi16 (-0x8000);
//	const __m128	offset   = _mm_set1_ps (-32768);
void	ProxyRwSse2 <SplFmt_STACK16>::write_flt (const Ptr::Type &ptr, const __m128 &src0, const __m128 &src1, const __m128i &mask_lsb, const __m128i &sign_bit, const __m128 &offset)
{
	__m128			val_03_f = _mm_add_ps (src0, offset);
	__m128			val_47_f = _mm_add_ps (src1, offset);

	const __m128i	val_03 = _mm_cvtps_epi32 (val_03_f);
	const __m128i	val_47 = _mm_cvtps_epi32 (val_47_f);

	__m128i			val = _mm_packs_epi32 (val_03, val_47);
	val = _mm_xor_si128 (val, sign_bit);

	fstb::ToolsSse2::store_8_16ml (
		ptr._msb_ptr,
		ptr._lsb_ptr,
		val,
		mask_lsb
	);
}
Beispiel #11
0
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
    float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2; 
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs;
  __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10));
  __m128i result11, result12, result22, result21; 
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);
  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);

  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  for (int i=0;i<nsymbols/4;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i  = _mm_packs_epi32(symbol_i1, symbol_i2);
    
    symbol_abs  = _mm_abs_epi16(symbol_i);
    symbol_abs  = _mm_sub_epi16(symbol_abs, offset);
    
    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);  
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);  

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);  
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);  

    _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++;
  }
  // Demodulate last symbols 
  for (int i=4*(nsymbols/4);i<nsymbols;i++) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
        
    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);    
  }
}
Beispiel #12
0
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}
Beispiel #13
0
void Convert444toNV12(LPBYTE input, int width, int inPitch, int outPitch, int height, int startY, int endY, LPBYTE *output)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uvPlane		= output[1];

    __m128i lumMask = _mm_set1_epi32(0x0000FF00);
    __m128i uvMask = _mm_set1_epi16(0x00FF);

    for(int y=startY; y<endY; y+=2)
    {
        int yPos    = y*inPitch;
        int uvYPos = (y>>1)*outPitch;
        int lumYPos = y*outPitch;

        for(int x=0; x<width; x+=4)
        {
            LPBYTE lpImagePos = input+yPos+(x*4);
            int uvPos  = uvYPos + x;
            int lumPos0 = lumYPos + x;
            int lumPos1 = lumPos0 + outPitch;

            __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
            __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+inPitch));

            //pack lum vals
            {
                __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                packVal = _mm_packus_epi16(packVal, packVal);

                *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
            }

            //do average, pack UV vals
            {
                __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));

                *(LPUINT)(uvPlane+uvPos) = _mm_packus_epi16(avgVal, avgVal).m128i_u32[0];
            }
        }
    }
}
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}
Beispiel #15
0
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
 const int stride = img->width;
 const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

 // Load the data (2 pixels in one load)
 __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]); 
 __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]); 

 __m128 weight = CalcWeights(x, y);

 // extend to 16bit
 p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
 p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

 // convert floating point weights to 16bit integer
 weight = _mm_mul_ps(weight, CONST_256); 
 __m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
         weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

 // prepare the weights
 __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
 __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
 w12 = _mm_unpacklo_epi16(w12, w12); // w2 w2 w2 w2 w1 w1 w1 w1
 w34 = _mm_unpacklo_epi16(w34, w34); // w4 w4 w4 w4 w3 w3 w3 w3
 
 // multiply each pixel with its weight (2 pixel per SSE mul)
 __m128i L12 = _mm_mullo_epi16(p12, w12);
 __m128i L34 = _mm_mullo_epi16(p34, w34);

 // sum the results
 __m128i L1234 = _mm_add_epi16(L12, L34); 
 __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
 __m128i L = _mm_add_epi16(L1234, Lhi);
  
 // convert back to 8bit
 __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
 L8 = _mm_packus_epi16(L8, _mm_setzero_si128());
 
 // return
 return _mm_cvtsi128_si32(L8);
}
Beispiel #16
0
Datei: dsp.cpp Projekt: taqu/opus
    void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 2; //8個のshortをまとめて処理
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        __declspec(align(16)) LSshort tmp[8];

        const LSshort* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            //32bit整数r0, r1に変換
            __m128i t0 = _mm_loadu_si128((const __m128i*)p);
            __m128i t1 = _mm_cmpgt_epi16(izero, t0);
            __m128i r0 = _mm_unpackhi_epi16(t0, t1);
            __m128i r1 = _mm_unpacklo_epi16(t0, t1);

            __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1)));
            __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1)));

            r2 = _mm_srai_epi32(r2, 1);
            r3 = _mm_srai_epi32(r3, 1);
            __m128i r4 = _mm_packs_epi32(r3, r2);
            _mm_store_si128((__m128i*)tmp, r4);
            q[0] = tmp[0];
            q[1] = tmp[2];
            q[2] = tmp[4];
            q[3] = tmp[6];
            p += 8;
            q += 4;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            s32 t = (p[j+0] + p[j+1]) >> 1;
            q[i] = static_cast<LSshort>(t);
        }
    }
Beispiel #17
0
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
                                      int tile_width, int tile_height,
                                      int green_to_red, int histo[]) {
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);

  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i A1 = _mm_and_si128(in1, mask_g);
      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0  | x r
      const __m128i B1 = _mm_srli_epi32(in1, 16);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0  | x dr
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x  | x r'
      const __m128i E1 = _mm_sub_epi8(B1, C1);
      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0  | 0 r'
      const __m128i F1 = _mm_and_si128(E1, mask);
      const __m128i I = _mm_packs_epi32(F0, F1);
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                      left_over, tile_height,
                                      green_to_red, histo);
    }
  }
}
Beispiel #18
0
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
 const int stride = img->width;
 const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

 // Load the data (2 pixels in one load)
 __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]); 
 __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]); 

 __m128 weight = CalcWeights(x, y);

 // convert RGBA RGBA RGBA RGAB to RRRR GGGG BBBB AAAA (AoS to SoA)
 __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
 __m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
 __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

 // extend to 16bit 
 __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
 __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());
 
 // convert weights to integer
 weight = _mm_mul_ps(weight, CONST_256); 
 __m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
         weighti = _mm_packs_epi32(weighti, weighti); // 32->2x16bit

 //outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
 __m128i outRG = _mm_madd_epi16(pRG, weighti);
 //outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
 __m128i outBA = _mm_madd_epi16(pBA, weighti);

 // horizontal add that will produce the output values (in 32bit)
 __m128i out = _mm_hadd_epi32(outRG, outBA);
 out = _mm_srli_epi32(out, 8); // divide by 256
 
 // convert 32bit->8bit
 out = _mm_packus_epi32(out, _mm_setzero_si128());
 out = _mm_packus_epi16(out, _mm_setzero_si128());

 // return
 return _mm_cvtsi128_si32(out);
}
static void GF_FUNC_ALIGN VS_CC
float_to_dst_gb_16bit(const float *srcp, uint8_t *d, int width, int height,
                      int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    dst_stride /= 2;
    __m128i tmax = _mm_set1_epi32((1 << bits) - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            __m128i xmm0 = _mm_cvtps_epi32(_mm_load_ps(srcp + x));
            __m128i xmm1 = _mm_cvtps_epi32(_mm_load_ps(srcp + x + 4));
            xmm0 = _mm_packs_epi32(mm_min_epi32(tmax, xmm0),
                                   mm_min_epi32(tmax, xmm1));

            _mm_store_si128((__m128i *)(dstp + x), xmm0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
Beispiel #20
0
Datei: dsp.cpp Projekt: taqu/opus
    void conv_Float1ToShort2(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; //4個のfloatをまとめて処理
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128 fcoff = _mm_set1_ps(32768.0f);
        __declspec(align(16)) LSshort tmp[8];

        const LSfloat* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff);
            __m128i s32_0 = _mm_cvtps_epi32(f32_0);
            __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0);

            _mm_store_si128((__m128i*)tmp, s16_0);

            q[0] = tmp[0];
            q[1] = tmp[0];
            q[2] = tmp[1];
            q[3] = tmp[1];
            q[4] = tmp[2];
            q[5] = tmp[2];
            q[6] = tmp[3];
            q[7] = tmp[3];
            p += 4;
            q += 8;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j=i<<1;
            q[j+0] = q[j+1] = toShort(p[i]);
        }
    }
Beispiel #21
0
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		if (useShift) {
			packed = _mm_srai_epi16(packed, volShift);
		}
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#elif PPSSPP_ARCH(ARM_NEON)
	int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer
	while (size >= 8) {
		int32x4_t in1 = vld1q_s32(in);
		int32x4_t in2 = vld1q_s32(in + 4);
		int16x4_t packed1 = vqmovn_s32(in1);
		int16x4_t packed2 = vqmovn_s32(in2);
		if (useShift) {
			packed1 = vshl_s16(packed1, signedVolShift);
			packed2 = vshl_s16(packed2, signedVolShift);
		}
		vst1_s16(out, packed1);
		vst1_s16(out + 4, packed2);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	// This does the remainder if SIMD was used, otherwise it does it all.
	for (size_t i = 0; i < size; i++) {
		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
	}
}
Beispiel #22
0
static void YuvToArgbRowSSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 2 <= len; n += 2) {
    const __m128i uv_0 = LoadUVPart(u[0], v[0]);
    const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
    const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
    _mm_storel_epi64((__m128i*)dst, tmp3);
    dst += 4 * 2;
    y += 2;
    ++u;
    ++v;
  }
  // Finish off
  if (len & 1) {
    VP8YuvToArgb(y[0], u[0], v[0], dst);
  }
}
Beispiel #23
0
/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
{
#ifdef LV_HAVE_SSE
  unsigned int number = 0;

  const unsigned int eighthPoints = len / 8;

  const float* inputVectorPtr = (const float*)x;
  int16_t* outputVectorPtr = z;

  __m128 vScalar = _mm_set_ps1(scale);
  __m128 inputVal1, inputVal2;
  __m128i intInputVal1, intInputVal2;
  __m128 ret1, ret2;

  for(;number < eighthPoints; number++){
    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

    ret1 = _mm_mul_ps(inputVal1, vScalar);
    ret2 = _mm_mul_ps(inputVal2, vScalar);

    intInputVal1 = _mm_cvtps_epi32(ret1);
    intInputVal2 = _mm_cvtps_epi32(ret2);

    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }

  number = eighthPoints * 8;
  for(; number < len; number++){
    z[number] = (int16_t) (x[number] * scale);
  }
#endif
}
Beispiel #24
0
Datei: dsp.cpp Projekt: taqu/opus
    void conv_Float2ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; //4個のfloatをまとめて処理
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128 fcoff = _mm_set1_ps(32768.0f*0.5f); //half
        __declspec(align(16)) LSshort tmp[8];

        const LSfloat* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_loadu_ps(p);
            __m128 f32_1 = _mm_add_ps(f32_0, _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 0, 1)));
            __m128 f32_2 = _mm_mul_ps(f32_1, fcoff);

            __m128i s32_0 = _mm_cvtps_epi32(f32_2);
            __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0);

            _mm_store_si128((__m128i*)tmp, s16_0);

            q[0] = tmp[0];
            q[1] = tmp[2];
            p += 4;
            q += 2;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            f32 v = 0.5f*(src[j+0] + src[j+1]);
            q[i] = toShort(v);
        }
    }
Beispiel #25
0
/**
 * SSE Implementation of \c cnsFormula (subroutine of cnsResponse).
 * \c scale, \c gaussI2 and \c regVar are 32bit floats (gaussI2 as A and B).
 * \c sobelX, \c sobelY, \c gaussI are signed short.
 * \c result is a packed vector of unsigned signed 8bit number with the x and y component
 * alternating and \c offset (unsigned char) added.
 */
ALWAYSINLINE static void cnsFormula(__m128i& result, __m128i sobelX, __m128i sobelY, __m128i& gaussI,
                                    const __m128& gaussI2A, const __m128& gaussI2B,
                                    const __m128& scale, const __m128& regVar, __m128i offset)
{
  __m128 gaussIA = _mm_cvtepi32_ps(_mm_unpacklo_epi16(gaussI, _mm_setzero_si128()));
  __m128 gaussIB = _mm_cvtepi32_ps(_mm_unpackhi_epi16(gaussI, _mm_setzero_si128()));

  __m128 factorA = _mm_add_ps(_mm_sub_ps(gaussI2A, _mm_mul_ps(gaussIA, gaussIA)), regVar); // gaussI2-gaussI^2+regVar
  __m128 factorB = _mm_add_ps(_mm_sub_ps(gaussI2B, _mm_mul_ps(gaussIB, gaussIB)), regVar);

  factorA = _mm_mul_ps(_mm_rsqrt_ps(factorA), scale); // scale/sqrt(gaussI2-gaussI^2+regVar)
  factorB = _mm_mul_ps(_mm_rsqrt_ps(factorB), scale);

  // (2^-11)*sobelX*(scale/sqrt(gaussI2-gaussI^2+regVar))
  __m128i factor = _mm_packs_epi32(_mm_cvtps_epi32(factorA), _mm_cvtps_epi32(factorB));
  __m128i resultXepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelX, 5), factor);
  __m128i resultYepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelY, 5), factor);

  // Convert to 8bit and interleave X and Y
  // the second argument of packs duplicates values to higher bytes, but these are ignored later, unpacklo interleaves X and Y
  __m128i resultepi8 = _mm_unpacklo_epi8(_mm_packs_epi16(resultXepi16, resultXepi16), _mm_packs_epi16(resultYepi16, resultYepi16));

  result = _mm_add_epi8(resultepi8, offset); // add offset, switching to epu8
}
Beispiel #26
0
//vz optimized template specialization
template<> void
cvtScale_<short, short, float>( const short* src, size_t sstep,
           short* dst, size_t dstep, Size size,
           float scale, float shift )   
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

	for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
		#if CV_SSE2
			if(USE_SSE2)
            {
                __m128 scale128 = _mm_set1_ps (scale);
                __m128 shift128 = _mm_set1_ps (shift);
				for(; x <= size.width - 8; x += 8 )
				{
					__m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
					__m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
					r0 = _mm_cvtps_epi32(rf0);
					r1 = _mm_cvtps_epi32(rf1);
    				r0 = _mm_packs_epi32(r0, r1);
					_mm_storeu_si128((__m128i*)(dst + x), r0);
				}    
			}
        #endif

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<short>(src[x]*scale + shift);
    } 
}
Beispiel #27
0
/*
 * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,    /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,          /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0            /* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip 32 bit pkt_type */
		0xFF, 0xFF
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		/* B.1 load 1 mbuf point */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* C.2 get 4 pkts staterr value  */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc avaialbe number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
Beispiel #28
0
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
Beispiel #29
0
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);

  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose.
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32   03 13 23 33
    // 00 10 20 30   01 11 21 31
    // 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
Beispiel #30
0
void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}