Example #1
    inline void Cryptor::assistKey192(__m128i *tmp, __m128i *tmp2,
                                      __m128i *tmp3) {
      // Duplicate the 2nd 32-bit part 4 times:
      // [1, 2, 3, 4] -> [2, 2, 2, 2]      
      __m128i tmp4;
      *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(1, 1, 1, 1));
      
      tmp4 = _mm_slli_si128(*tmp, 0x4);
      *tmp = _mm_xor_si128(*tmp, tmp4);
      
      tmp4 = _mm_slli_si128(tmp4, 0x4);
      *tmp = _mm_xor_si128(*tmp, tmp4);
      
      tmp4 = _mm_slli_si128(tmp4, 0x4);
      *tmp = _mm_xor_si128(*tmp, tmp4);
      
      *tmp = _mm_xor_si128(*tmp, *tmp2);

      // Duplicate the 4th 32-bit part 4 times.
      *tmp2 = _mm_shuffle_epi32(*tmp, SHUFFLE4_32(3, 3, 3, 3));
      
      tmp4 = _mm_slli_si128(*tmp3, 0x4);
      *tmp3 = _mm_xor_si128(*tmp3, tmp4);
      
      *tmp3 = _mm_xor_si128(*tmp3, *tmp2);      
    }
Example #2
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
#if defined(__SSE2__)
{
 for(unsigned col = 0; col < 8; col++)
 {
  __m128i c =  _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);

  for(unsigned x = 0; x < 8; x++)
  {
   __m128i sum;
   __m128i m;
   int32 tmp[4] MDFN_ALIGN(16);

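   // Multiply-accumulate one IDCT matrix row against the coefficient column,
   // then add the four 32-bit partial sums together with two shuffle+add steps.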
   m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
   sum = _mm_madd_epi16(m, c);
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));

   //_mm_store_ss((float *)&tmp[0], (__m128)sum);
   _mm_store_si128((__m128i*)tmp, sum);

   if(sizeof(T) == 1)
    out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
   else
    out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
  }
 }
}
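Example #3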
void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
                                          int stride) {
  uint8_t abs_diff;
  __m128i d;
  int i = 8;

  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  }

  do {
    // Prediction data.
    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));

    // Clip diff value to [0, 255] range. Then, do addition or subtraction
    // according to its sign.
    if (diff >= 0) {
      p0 = _mm_adds_epu8(p0, d);
      p1 = _mm_adds_epu8(p1, d);
      p2 = _mm_adds_epu8(p2, d);
      p3 = _mm_adds_epu8(p3, d);
      p4 = _mm_adds_epu8(p4, d);
      p5 = _mm_adds_epu8(p5, d);
      p6 = _mm_adds_epu8(p6, d);
      p7 = _mm_adds_epu8(p7, d);
    } else {
      p0 = _mm_subs_epu8(p0, d);
      p1 = _mm_subs_epu8(p1, d);
      p2 = _mm_subs_epu8(p2, d);
      p3 = _mm_subs_epu8(p3, d);
      p4 = _mm_subs_epu8(p4, d);
      p5 = _mm_subs_epu8(p5, d);
      p6 = _mm_subs_epu8(p6, d);
      p7 = _mm_subs_epu8(p7, d);
    }

    // Store results
    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);

    dest += 4 * stride;
  } while (--i);
}
Example #4
void INIT_Htable(uint8_t Htbl[16*8], uint8_t *H)
{
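	/* Precompute H^1..H^8 for PCLMUL-based GHASH/POLYVAL-style hashing: each pass
	   carry-less multiplies the running power by H and reduces the 256-bit product
	   modulo the field polynomial before storing it into Htbl. */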
	int i;
	__m128i T, TMP0, TMP1, TMP2, TMP3, TMP4, POLY;
	POLY = _mm_setr_epi32(0x1,0,0,0xc2000000);
	T = _mm_loadu_si128(((__m128i*)H));
	TMP0 = T;
	_mm_storeu_si128(&((__m128i*)Htbl)[0], T);
	for (i=1; i<8; i++)
	{
		TMP1 = _mm_clmulepi64_si128(T, TMP0, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, TMP0, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, TMP0, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, TMP0, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
		_mm_storeu_si128(&((__m128i*)Htbl)[i], T);
	}
}
Example #5
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
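  /* SSE2 substitute for _mm_mullo_epi32 (which needs SSE4.1): multiply the even
     and odd 32-bit lanes with _mm_mul_epu32, keep the low 32 bits of each 64-bit
     product, and interleave them back into lane order. */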
  __m128i b = _mm_set1_epi32(b1);
  __m128i lo = _mm_mul_epu32(a, b);
  __m128i hi = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)),
   _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0)));
}
Example #6
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);

   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   B0 = _mm_unpacklo_epi32(T0, T1);
   B1 = _mm_unpackhi_epi32(T0, T1);
   B2 = _mm_unpacklo_epi32(T2, T3);
   B3 = _mm_unpackhi_epi32(T2, T3);
   }
Example #7
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}
Example #8
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[2], xmm1[2];

  numof16belem = size / (16*2);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of
       32 bytes */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
    }
    /* Store the result vectors */
    for (k = 0; k < 2; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
    }
  }
}
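Example #9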
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
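  /* 16-bit dot product: _mm_madd_epi16 forms pairwise 32-bit products, the loop
   * accumulates them per lane, and the trailing shuffles fold the four lanes
   * into one sum before rounding, shifting and saturating back to 16 bits. */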
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
Example #10
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
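   // AES-192 key schedule helper (AES-NI): broadcast the rcon word from
   // key2_with_rcon, fold it into key1 with the usual shift/xor cascade, and,
   // unless this is the last round, derive the next two words of key2 from
   // key1's top word.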
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   key2_with_rcon  = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

   if(last)
      return;

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

   *K2 = key2;
   out[4] = _mm_cvtsi128_si32(key2);
   out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
   }
Example #11
inline __m128i 
func_mul_epu32(__m128i a, __m128i b){
#if 1
  // Multiply elements 0 and 2, and put the 64-bit results into a vector.
  __m128i tmp02 = _mm_mul_epu32(a, b);
  // Shift the vectors right by one word (element 3 -> 2, element 1 -> 0), and
  // then multiply into a double-word vector.
  __m128i tmp13 = _mm_mul_epu32( _mm_srli_si128(a, 4), _mm_srli_si128(b,4));
  // Shuffle the vectors to place the lower 32 bits of each result in the
  // lower two words. I have some concerns about endianness and portability
  // related to this function.
  __m128i tmpres02 = _mm_shuffle_epi32(tmp02, _MM_SHUFFLE(0,0,2,0));
  __m128i tmpres13 = _mm_shuffle_epi32(tmp13, _MM_SHUFFLE(0,0,2,0));
  // unpack the shuffled vectors into a return value
  return _mm_unpacklo_epi32(tmpres02, tmpres13);
#else
  pvInt ret;
  int * p_a;
  int * p_b;
  p_a = (int *)&a;
  p_b = (int *)&b;
  for(int m=0; m < VEC_SIZE; m++){
    ret.v[m] = p_a[m] * p_b[m];
  }
  return ret.r;
#endif
    
}
Example #12
//FINL 
int32 __ext_v_sum_int32(int32* x, int len)
{
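	// Sum a 32-bit integer array: accumulate four lanes at a time, fold the
	// lanes together by rotating the vector with _mm_shuffle_epi32 and adding,
	// then handle the remaining tail elements in scalar code.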
	__m128i msum = _mm_setzero_si128();
	int32 ret;
	const int wlen = 4;

	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		msum = _mm_add_epi32(msum, mx);
	}

	__m128i mout = msum;
	
	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi32(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi32(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi32(mout, msum);

	
	ret = _mm_cvtsi128_si32 (mout);
	
	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		ret += x[i];
	}

	return ret;

}
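
For reference, the rotate-and-add reduction used above can be packaged as a tiny standalone helper. This is only a sketch assuming SSE2; hsum_epi32 is an illustrative name, not part of the example:

#include <emmintrin.h>

/* Horizontal sum of the four 32-bit lanes of v using _mm_shuffle_epi32. */
static int hsum_epi32(__m128i v)
{
	__m128i t = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
	t = _mm_add_epi32(t, _mm_shuffle_epi32(t, _MM_SHUFFLE(1, 0, 3, 2)));
	return _mm_cvtsi128_si32(t);
}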
Example #13
void Polyval_Horner(unsigned char* TAG,
					unsigned char* pH,
					unsigned char* inp,
					int length)
{
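	/* Horner-style GHASH/POLYVAL evaluation: for each 16-byte block, xor it into
	   the tag and carry-less multiply by H, reducing the 256-bit product with two
	   PCLMULs against POLY; a short final block is zero-padded into B first. */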
	__m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
	int i=0;
	if (length==0)
		return;
	int has_semi = length%16;
	uint8_t B[16]={0};
	length /=16;
	
	H = _mm_loadu_si128(((__m128i*)pH));
	T = _mm_loadu_si128(((__m128i*)TAG));
	POLY = _mm_setr_epi32(0x1,0,0,0xc2000000);
	for (i=0; i< length; i++)
	{
		T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
		TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
	}
	if (has_semi!=0)
	{
		memcpy(B, inp+length*16, has_semi);
		T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
		TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
	}
	_mm_storeu_si128(((__m128i*)TAG), T);
}
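Example #14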
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
Example #15
static long
conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
{
  long i = 0;
  long remainder;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      const long    n = (samples / 2) * 2;
      const __v4sf *s = (const __v4sf*) src;
            __v4sf *d = (__v4sf*)dst;

      for ( ; i < n; i += 2)
        {
          __v4sf rbaa0, rbaa1;
        
          __v4sf rgba0 = *s++;
          __v4sf rgba1 = *s++;

          /* Expand alpha */
          __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
          __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
          
          /* Premultiply */
          rgba0 = rgba0 * aaaa0;
          rgba1 = rgba1 * aaaa1;
          
          /* Shuffle the original alpha value back in */
          rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0));
          rbaa1 = _mm_shuffle_ps(rgba1, aaaa1, _MM_SHUFFLE(0, 0, 2, 0));
          
          rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0));
          rgba1 = _mm_shuffle_ps(rgba1, rbaa1, _MM_SHUFFLE(2, 1, 1, 0));
          
          *d++ = rgba0;
          *d++ = rgba1;
        }
      _mm_empty ();
    }

  dst += i * 4;
  src += i * 4;
  remainder = samples - i;
  while (remainder--)
  {
    const float a = src[3];
    dst[0] = src[0] * a;
    dst[1] = src[1] * a;
    dst[2] = src[2] * a;
    dst[3] = a;
    
    src += 4;
    dst += 4;
  }

  return samples;
}
Example #16
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uPlane       = output[1];
    LPBYTE vPlane       = output[2];
    int  chrPitch       = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos    = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos  = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0+width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];

                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
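Example #17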
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
Example #18
DWORD WINAPI Autotune::AutotuneThread(void *i)
//--------------------------------------------
{
	AutotuneThreadData &info = *static_cast<AutotuneThreadData *>(i);
	info.histogram.resize(HISTORY_BINS, 0);
#ifdef ENABLE_SSE2
	const bool useSSE = (GetProcSupport() & PROCSUPPORT_SSE2) != 0;
#endif

	// Do autocorrelation and save results in a note histogram (restricted to one octave).
	for(int note = info.startNote, noteBin = note; note < info.endNote; note++, noteBin++)
	{

		if(noteBin >= HISTORY_BINS)
		{
			noteBin %= HISTORY_BINS;
		}

		const SmpLength autocorrShift = NoteToShift(info.sampleFreq, note, info.pitchReference);

		uint64 autocorrSum = 0;

#ifdef ENABLE_SSE2
		if(useSSE)
		{
			const __m128i *normalData = reinterpret_cast<const __m128i *>(info.sampleData);
			const __m128i *shiftedData = reinterpret_cast<const __m128i *>(info.sampleData + autocorrShift);
			for(SmpLength i = info.processLength / 8; i != 0; i--)
			{
				__m128i normal = _mm_loadu_si128(normalData++);
				__m128i shifted = _mm_loadu_si128(shiftedData++);
				__m128i diff = _mm_sub_epi16(normal, shifted);		// 8 16-bit differences
				__m128i squares = _mm_madd_epi16(diff, diff);		// Multiply and add: 4 32-bit squares

				__m128i sum1 = _mm_shuffle_epi32(squares, _MM_SHUFFLE(0, 1, 2, 3));	// Move upper two integers to lower
				__m128i sum2  = _mm_add_epi32(squares, sum1);						// Now we can add the (originally) upper two and lower two integers
				__m128i sum3 = _mm_shuffle_epi32(sum2, _MM_SHUFFLE(1, 1, 1, 1));	// Move the second-lowest integer to lowest position
				__m128i sum4  = _mm_add_epi32(sum2, sum3);							// Add the two lowest positions
				autocorrSum += _mm_cvtsi128_si32(sum4);
			}
		} else
#endif
		{
			const int16 *normalData = info.sampleData;
			const int16 *shiftedData = info.sampleData + autocorrShift;
			// Add up squared differences of all values
			for(SmpLength i = info.processLength; i != 0; i--, normalData++, shiftedData++)
			{
				autocorrSum += (*normalData - *shiftedData) * (*normalData - *shiftedData);
			}
		}

		info.histogram[noteBin] += autocorrSum;
	}
	return 0;
}
Example #19
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Example #20
__SIMDi _SIMD_mul_epi32(__SIMDi a, __SIMDi b)
{
#ifdef  USE_SSE
    __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
    __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
#elif defined USE_AVX
  return _mm256_mul_ps(a,b);
#elif defined USE_IBM
  return vec_mul(a,b);
#endif
}
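Example #21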
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
Example #22
void gf2x_mul2(unsigned long * t, unsigned long const * s1,
        unsigned long const * s2)
{
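    /* Karatsuba-style 128x128 -> 256-bit carry-less multiply: PCLMUL the low and
       high 64-bit halves, swap halves with _mm_shuffle_epi32(..., 78) to form the
       xor of the halves, multiply those, and combine the products into t[0..3]. */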
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;

    __v2di ss1, ss2, s1s, s2s;
    __v2di_proxy t00, tk;
#ifndef BORROW
    __v2di_proxy t11;
#endif
    ss1 = _mm_loadu_si128((__v2di *)s1);
    ss2 = _mm_loadu_si128((__v2di *)s2);


    t00.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    t11.s = _mm_clmulepi64_si128(ss1, ss2, 17);
#endif

    s1s = _mm_shuffle_epi32(ss1, 78);
    ss1 ^= s1s;
    s2s = _mm_shuffle_epi32(ss2, 78);
    ss2 ^= s2s;

    tk.s = _mm_clmulepi64_si128(ss1, ss2, 0);

#ifndef BORROW
    tk.s ^= t00.s ^ t11.s;
#endif

    /* store result */
    t[0] = t00.x[0];
#ifdef BORROW
    t[1] = t00.x[1] ^ tk.x[0] ^ t00.x[0] ^ c[0];
    t[2] = c[0] ^ tk.x[1] ^ t00.x[1] ^ c[1];
    t[3] = c[1];
#else
    t[1] = t00.x[1] ^ tk.x[0];
    t[2] = t11.x[0] ^ tk.x[1];
    t[3] = t11.x[1];
#endif
#ifdef CARRY
    c[0] = t11.x[0];
    c[1] = t11.x[1];
#endif
}
Example #23
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 2) {
    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
    _mm_storel_epi64((__m128i*)dst, tmp3);
    dst += 4 * 2;
  }
}
Example #24
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16            *inVec1,            /*    I input vector 1                                              */
    const opus_int16            *inVec2,            /*    I input vector 2                                              */
    const opus_int              len                 /*    I vector lengths                                              */
)
{
    opus_int  i, dataSize8;
    opus_int64 sum;

    __m128i xmm_tempa;
    __m128i inVec1_76543210, acc1;
    __m128i inVec2_76543210, acc2;

    sum = 0;
    dataSize8 = len & ~7;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for( i = 0; i < dataSize8; i += 8 ) {
        inVec1_76543210 = _mm_loadu_si128( (__m128i*)(&inVec1[i + 0] ) );
        inVec2_76543210 = _mm_loadu_si128( (__m128i*)(&inVec2[i + 0] ) );

        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
        /* equivalent to shifting right by 8 bytes */
        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

        acc1 = _mm_add_epi64( acc1, xmm_tempa );
        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
    }

    acc1 = _mm_add_epi64( acc1, acc2 );

    /* equivalent to shifting right by 8 bytes */
    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    acc1 = _mm_add_epi64( acc1, acc2 );

    _mm_storel_epi64( (__m128i *)&sum, acc1 );

    for( ; i < len; i++ ) {
        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
    }

    return sum;
}
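Example #25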
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
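    /* Descend a linearized k-ary search tree: broadcast the key into all four
       lanes, compare four delimiters per step with _mm_cmpgt_epi32, and use the
       movemask to pick which child to follow at each level. */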
    int iLevel = 0;
    int lOffset = 0;
    int pOffset = 0;
    int32_t cmpmask = 0;
    int32_t eqmask = 0;

     __m128i key = _mm_cvtsi32_si128(value);
    key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0));

    while (iLevel < levels) {
        int f = fanout[iLevel];
        pOffset = lOffset;
        lOffset *= f - 1;
        int iter = 0;
        int position = 0;
        while (iter < f/4) {
            __m128i delimiters = _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter*4]);
            __m128i compare = _mm_cmpgt_epi32(key, delimiters);
            cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
            cmpmask ^= 0x0F;
            if (cmpmask) {
                position = _bit_scan_forward(cmpmask);
                break;
            }
            iter++;
        }
        int offset = lOffset + iter*4 + position;
        lOffset = offset + pOffset;
        iLevel++;
    }
    return lOffset;
}
Example #26
__inline__ static void mul4(__m128i in, __m128i *out) {
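	/* Multiply a 128-bit block (block is presumably a typedef for __m128i defined
	   elsewhere) by x^2 in GF(2^128): byte-reverse for endianness, then double
	   twice; each doubling shifts every 32-bit lane left by one, carries the
	   shifted-out bits into the next lane, and xors the 0x87 reduction constant
	   into the low lane when the top bit was set. */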
	const __m128i shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	const __m128i mask = _mm_set_epi32(135, 1, 1, 1);
	block intmp = _mm_shuffle_epi8(in, shuf);
	block tmp = _mm_srai_epi32(intmp, 31);
	tmp = _mm_and_si128(tmp, mask);
	tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
	*out = _mm_slli_epi32(intmp, 1);
	*out = _mm_xor_si128(*out, tmp);
	tmp = _mm_srai_epi32(*out, 31);
	tmp = _mm_and_si128(tmp, mask);
	tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
	*out = _mm_slli_epi32(*out, 1);
	*out = _mm_xor_si128(*out, tmp);
	*out = _mm_shuffle_epi8(*out, shuf);
}
Example #27
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
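   // One round of the vector-permute (SSSE3) AES key schedule. Relies on
   // constants defined elsewhere in the source (low_nibs, k_inv1, k_inv2, sb1u,
   // sb1t) and on the mm_xor3 helper.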
   if(rcon)
      {
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }
Example #28
inline void KEY_192_ASSIST(__m128i* temp1, __m128i * temp2, __m128i * temp3) 
        { 
            __m128i temp4; 
            *temp2 = _mm_shuffle_epi32 (*temp2, 0x55);  
            temp4 = _mm_slli_si128 (*temp1, 0x4);  
            *temp1 = _mm_xor_si128 (*temp1, temp4); 
            temp4 = _mm_slli_si128 (temp4, 0x4);  
            *temp1 = _mm_xor_si128 (*temp1, temp4); 
            temp4 = _mm_slli_si128 (temp4, 0x4);  
            *temp1 = _mm_xor_si128 (*temp1, temp4); 
            *temp1 = _mm_xor_si128 (*temp1, *temp2); 
            *temp2 = _mm_shuffle_epi32(*temp1, 0xff); 
            temp4 = _mm_slli_si128 (*temp3, 0x4);  
            *temp3 = _mm_xor_si128 (*temp3, temp4); 
            *temp3 = _mm_xor_si128 (*temp3, *temp2); 
            } 
Example #29
// This function is called in code/WiFi/receiver/downSample.blk
//FINL 
int __ext_permutatew1313 (struct complex16* x,
               int __unused_2,  struct complex16* y, int __unused_1)
{
	assert (__unused_2 == 4);
	assert (__unused_1 == 4);


	//vcs *pi = (vcs *)x;
	//int tmp = 1;
	//vcs t1 = permutate<1,3,1,3>(pi[0]);
	//// tried to use this to avoid making a template, 
	//// but the mm function also requires a constant expression as the second argument
	////vcs t1 = (vcs)_mm_shuffle_epi32(pi[0], (((a3) << 6) | ((a2) << 4) | ((a1) << 2) | ((a0))));
	////int mvp;
	////vcs t1 = (vcs)_mm_shuffle_epi32(pi[0], mvp);
	//vcs *po = (vcs *)y;
	//*po = t1;


	__m128i mx = _mm_loadu_si128((__m128i *)x );
	_mm_storeu_si128((__m128i *) y, _mm_shuffle_epi32(mx, _MM_SHUFFLE(3, 1, 3, 1)));


	return 0;
}
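Example #30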
    SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        SIMDValue result;
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        if (AutoSystemInfo::Data.SSE4_1Available())
        {   // a * b, only available in SSE4
            x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else if (AutoSystemInfo::Data.SSE2Available())
        {
            // mul 2,0: r0 = a0*b0; r1 = a2*b2
            __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            // mul 3,1: r0=a1*b1; r1=a3*b3
            __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4), _mm_srli_si128(tmpbValue.m128i_value, 4));
            // shuffle x86Results to [63..0] and pack
            x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else
        {
            result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
            result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
            result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
            result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
        }

        return result;
    }