Beispiel #1
0
// Lane-wise 32-bit multiply of two integer vectors using only SSE2.
// SSE2 has no 32x32->32 multiply, so one is built from _mm_mul_epu32,
// which multiplies lanes 0 and 2 into two 64-bit products.
inline __m128i 
func_mul_epu32(__m128i a, __m128i b){
#if 1
  // Products of lanes 0 and 2 (two 64-bit results in one vector).
  const __m128i even = _mm_mul_epu32(a, b);
  // Shift both inputs right by one dword (3->2 and 1->0), then multiply
  // again to get the products of lanes 1 and 3.
  const __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  // Gather the low 32 bits of each 64-bit product into the two low dwords.
  // NOTE(review): the original author flagged possible endianness and
  // portability concerns with this shuffle — worth confirming on new targets.
  const __m128i evenLo = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i oddLo  = _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0));
  // Interleave the halves back into lane order 0,1,2,3.
  return _mm_unpacklo_epi32(evenLo, oddLo);
#else
  // Scalar reference implementation (not compiled; kept for comparison).
  pvInt ret;
  int * p_a = (int *)&a;
  int * p_b = (int *)&b;
  for(int m = 0; m < VEC_SIZE; m++){
    ret.v[m] = p_a[m] * p_b[m];
  }
  return ret.r;
#endif
}
Beispiel #2
0
// Multiply each 32-bit lane of |a| by the scalar |b1|, keeping the low
// 32 bits of each product, using only SSE2 (no _mm_mullo_epi32, which
// requires SSE4.1).
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
  __m128i b = _mm_set1_epi32(b1);
  // _mm_mul_epu32 multiplies lanes 0 and 2 into 64-bit products.
  __m128i lo = _mm_mul_epu32(a, b);
  // Shift |a| right one dword so lanes 1 and 3 land in 0 and 2. |b| is a
  // broadcast, so its lanes 0 and 2 already hold b1 — no shift needed
  // (the original shifted b too, which was a wasted instruction).
  __m128i hi = _mm_mul_epu32(_mm_srli_si128(a, 4), b);
  // Compact the low 32 bits of each product and re-interleave into order.
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)),
   _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0)));
}
    // Element-wise signed 32-bit multiply of two SIMD values.
    // Dispatches on CPU capability: SSE4.1 _mm_mullo_epi32 when available,
    // an SSE2 emulation otherwise, and a scalar fallback as a last resort.
    SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        SIMDValue result;
        X86SIMDValue x86Result;
        X86SIMDValue x86A = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue x86B = X86SIMDValue::ToX86SIMDValue(bValue);

        if (AutoSystemInfo::Data.SSE4_1Available())
        {
            // Direct lane-wise multiply; this instruction is SSE4.1-only.
            x86Result.m128i_value = _mm_mullo_epi32(x86A.m128i_value, x86B.m128i_value);
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else if (AutoSystemInfo::Data.SSE2Available())
        {
            // Even lanes: 64-bit products of lanes 0 and 2.
            const __m128i evenProducts = _mm_mul_epu32(x86A.m128i_value, x86B.m128i_value);
            // Odd lanes: shift dwords down one slot, then 64-bit products of lanes 1 and 3.
            const __m128i oddProducts = _mm_mul_epu32(_mm_srli_si128(x86A.m128i_value, 4), _mm_srli_si128(x86B.m128i_value, 4));
            // Keep the low 32 bits of each product and interleave back into lane order.
            x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0)));
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else
        {
            // Scalar fallback: one lane at a time.
            result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
            result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
            result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
            result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
        }

        return result;
    }
Beispiel #4
0
// Convert a value in [0, 99999999] into eight 16-bit lanes holding its
// decimal digits, most significant first: { a, b, c, d, e, f, g, h }.
// All divisions are done as fixed-point reciprocal multiplies.
// NOTE(review): kDiv10000Vector, k10000Vector, kDivPowersVector,
// kShiftPowersVector and k10Vector are defined elsewhere — presumably the
// magic reciprocal/shift tables this scheme needs; verify against their defs.
inline __m128i Convert8DigitsSSE2(uint32_t value) {
	assert(value <= 99999999);

	// abcd, efgh = abcdefgh divmod 10000 
	// (the divide is a multiply by a magic constant followed by >> 45)
	const __m128i abcdefgh = _mm_cvtsi32_si128(value);
	const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
	const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

	// v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

	// v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
	// (pre-scaling feeds the mulhi-based divides below)
	const __m128i v1a = _mm_slli_epi64(v1, 2);

	// v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
	const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
	const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

	// v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
	// (two mulhi steps: reciprocal multiply, then a per-lane power-of-two shift)
	const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
	const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

	// v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
	const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

	// v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
	// (shift within each 64-bit half, so the two 4-digit groups stay separate)
	const __m128i v6 = _mm_slli_epi64(v5, 16);

	// v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
	// (each lane minus 10x its more-significant neighbor isolates one digit)
	const __m128i v7 = _mm_sub_epi16(v4, v6);

	return v7;
}
Beispiel #5
0
// Multiply four dispatched vectors by *mult, drop the fixed-point
// fraction (floor, no rounding), and pack the results down to 8 bytes
// stored at dst.
static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
                                              const __m128i* const A1,
                                              const __m128i* const A2,
                                              const __m128i* const A3,
                                              const __m128i* const mult,
                                              uint8_t* const dst) {
  // Mask selecting the odd 32-bit lanes.
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  // 32x32 -> 64-bit unsigned products (lanes 0 and 2 of each input).
  const __m128i prod0 = _mm_mul_epu32(*A0, *mult);
  const __m128i prod1 = _mm_mul_epu32(*A1, *mult);
  const __m128i prod2 = _mm_mul_epu32(*A2, *mult);
  const __m128i prod3 = _mm_mul_epu32(*A3, *mult);
  // First pair: shift the integer part down into the even lanes.
  const __m128i lo0 = _mm_srli_epi64(prod0, WEBP_RESCALER_RFIX);
  const __m128i lo1 = _mm_srli_epi64(prod1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
  // Second pair: move the integer part up into the odd lanes, then mask.
  const __m128i hi0 =
      _mm_and_si128(_mm_slli_epi64(prod2, 32 - WEBP_RESCALER_RFIX), mask);
  const __m128i hi1 =
      _mm_and_si128(_mm_slli_epi64(prod3, 32 - WEBP_RESCALER_RFIX), mask);
#else
  // RFIX == 32: the odd lane of each 64-bit product is already the result.
  const __m128i hi0 = _mm_and_si128(prod2, mask);
  const __m128i hi1 = _mm_and_si128(prod3, mask);
#endif
  // Merge even/odd lanes, then narrow 32 -> 16 -> 8 bits with saturation.
  const __m128i mix0 = _mm_or_si128(lo0, hi0);
  const __m128i mix1 = _mm_or_si128(lo1, hi1);
  const __m128i packed16 = _mm_packs_epi32(mix0, mix1);
  const __m128i packed8 = _mm_packus_epi16(packed16, packed16);
  _mm_storel_epi64((__m128i*)dst, packed8);
}
Beispiel #6
0
// Multiply four dispatched vectors by *mult with rounding, and pack the
// results down to 8 bytes stored at dst.
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  // Rounding constant placed in the even 32-bit lane of each 64-bit half.
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  // Mask selecting the odd 32-bit lanes.
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  // 32x32 -> 64-bit unsigned products (lanes 0 and 2 of each input).
  const __m128i prod0 = _mm_mul_epu32(*A0, *mult);
  const __m128i prod1 = _mm_mul_epu32(*A1, *mult);
  const __m128i prod2 = _mm_mul_epu32(*A2, *mult);
  const __m128i prod3 = _mm_mul_epu32(*A3, *mult);
  // Round before discarding the fixed-point fraction.
  const __m128i sum0 = _mm_add_epi64(prod0, rounder);
  const __m128i sum1 = _mm_add_epi64(prod1, rounder);
  const __m128i sum2 = _mm_add_epi64(prod2, rounder);
  const __m128i sum3 = _mm_add_epi64(prod3, rounder);
  // First pair: shift the integer part down into the even lanes.
  const __m128i lo0 = _mm_srli_epi64(sum0, WEBP_RESCALER_RFIX);
  const __m128i lo1 = _mm_srli_epi64(sum1, WEBP_RESCALER_RFIX);
  // Second pair: keep the odd lanes directly. NOTE(review): this is only
  // the integer part if WEBP_RESCALER_RFIX == 32 — confirm that invariant.
  const __m128i hi0 = _mm_and_si128(sum2, mask);
  const __m128i hi1 = _mm_and_si128(sum3, mask);
  // Merge even/odd lanes, then narrow 32 -> 16 -> 8 bits with saturation.
  const __m128i mix0 = _mm_or_si128(lo0, hi0);
  const __m128i mix1 = _mm_or_si128(lo1, hi1);
  const __m128i packed16 = _mm_packs_epi32(mix0, mix1);
  const __m128i packed8 = _mm_packus_epi16(packed16, packed16);
  _mm_storel_epi64((__m128i*)dst, packed8);
}
Beispiel #7
0
// SSE2 row import for downscaling: accumulates runs of 4-channel 8-bit
// source pixels into one output pixel per x_sub step, writing 32-bit
// fixed-point results to wrk->frow and carrying the fractional part of
// the last source pixel over to the next output pixel in 'sum'.
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  // Fall back to the portable C version when the 4-channel / reduction
  // ratio assumptions do not hold (see the overflow note in the loop).
  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    // Accumulate whole source pixels until this output pixel is covered.
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);  // widen 4x 8-bit channels to 16-bit
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      // -accum is the fraction of the last source pixel that belongs to
      // the NEXT output pixel: subtract it from this one, carry it along.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      // Rescale the carried fraction by fx_scale per 64-bit lane:
      // effectively (frac * fx_scale + ROUNDER) >> 32.
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      // Keep the high 32 bits of each 64-bit result (lanes 1 and 3) ...
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      // ... re-interleave, and narrow back to 16-bit for the next pass.
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Beispiel #8
0
__SIMDi _SIMD_mul_epi32(__SIMDi a, __SIMDi b)
{
#ifdef  USE_SSE
    __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
    __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
#elif defined USE_AVX
  return _m256_mul_ps(a,b);
#elif defined USE_IBM
  return vec_mul(a,b);
#endif
}
Beispiel #9
0
        // SSE2 rand(): steps four interleaved LCG streams kept in the
        // global __ccaprice_stdlib_rseed, then returns one 15-bit value
        // (bits 16..30 of lane 0, masked to 0x7FFF — same range as the
        // classic C rand()).
        inline int rand() {
            __m128i split;
            __m128i multi;
            __m128i adder;
            __m128i mmask;
            __m128i smask;
            __m128i store;

            // DATA() presumably declares the aligned backing arrays
            // (data_multi, data_adder, ...) loaded below — the macro is
            // defined elsewhere and #undef'd immediately after use.
            DATA(multi)={0x000343FD,0x000043FD,0x000343FD,0x00010DCD};
            DATA(adder)={0x00269EC3,0x009E9EC3,0x00D19EC3,0x00000001};
            DATA(mmask)={0xFFFFFFFF,0x00000000,0xFFFFFFFF,0x00000000};
            DATA(smask)={0x00007FFF,0x00007FFF,0x00007FFF,0x00007FFF};
            #undef DATA

            adder = _mm_load_si128   ((__m128i*)data_adder);
            multi = _mm_load_si128   ((__m128i*)data_multi);
            mmask = _mm_load_si128   ((__m128i*)data_mmask);
            smask = _mm_load_si128   ((__m128i*)data_smask);

            // _mm_mul_epu32 only multiplies lanes 0 and 2, so work on a
            // lane-swapped copy to cover the odd lanes as well.
            split = _mm_shuffle_epi32(
                __ccaprice_stdlib_rseed,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );

            __ccaprice_stdlib_rseed = _mm_mul_epu32(__ccaprice_stdlib_rseed, multi);
            multi                   = _mm_shuffle_epi32(
                multi,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );
            split                   = _mm_mul_epu32(split, multi);
            // Keep only the low 32 bits of each 64-bit product (lanes 0
            // and 2), then merge the two halves back into one state vector.
            __ccaprice_stdlib_rseed = _mm_and_si128(__ccaprice_stdlib_rseed, mmask);
            split                   = _mm_and_si128(split, mmask);
            split                   = _mm_shuffle_epi32(
                split,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );
            __ccaprice_stdlib_rseed = _mm_or_si128  (__ccaprice_stdlib_rseed, split);
            __ccaprice_stdlib_rseed = _mm_add_epi32 (__ccaprice_stdlib_rseed, adder);
            // Extract bits 16..30 of each lane; lane 0 is the result.
            store                   = _mm_srai_epi32(__ccaprice_stdlib_rseed, 0x10);
            store                   = _mm_and_si128 (store, smask);

            return (unsigned int)_mm_cvtsi128_si32(store);

            #undef __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS2
            #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS1
        }
Beispiel #10
0
// Clang codegen test: verifies that _mm_mul_epu32 lowers to the
// llvm.x86.sse2.pmulu.dq intrinsic (IR) and the PMULUDQ instruction (asm).
// The DAG/ASM comment lines below are FileCheck directives — do not edit.
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_mul_epu32
  // DAG: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  //
  // ASM-LABEL: test_mm_mul_epu32
  // ASM: pmuludq
  return _mm_mul_epu32(A, B);
}
Beispiel #11
0
// Load eight 32-bit values from *src, split them into even/odd 64-bit
// lane vectors, and (when mult is non-NULL) multiply each by *mult,
// writing the four dispatched vectors to [out0 ... out3].
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
                                            const __m128i* const mult,
                                            __m128i* const out0,
                                            __m128i* const out1,
                                            __m128i* const out2,
                                            __m128i* const out3) {
  // Two unaligned loads cover eight 32-bit source values.
  const __m128i even0 = _mm_loadu_si128((const __m128i*)(src + 0));
  const __m128i even1 = _mm_loadu_si128((const __m128i*)(src + 4));
  // Move the odd 32-bit lanes into the even positions, as required by
  // _mm_mul_epu32 (which only multiplies lanes 0 and 2).
  const __m128i odd0 = _mm_srli_epi64(even0, 32);
  const __m128i odd1 = _mm_srli_epi64(even1, 32);
  if (mult == NULL) {
    // Pass-through: dispatch the loaded lanes unmultiplied.
    *out0 = even0;
    *out1 = even1;
    *out2 = odd0;
    *out3 = odd1;
  } else {
    // 32x32 -> 64-bit products of every lane with *mult.
    *out0 = _mm_mul_epu32(even0, *mult);
    *out1 = _mm_mul_epu32(even1, *mult);
    *out2 = _mm_mul_epu32(odd0, *mult);
    *out3 = _mm_mul_epu32(odd1, *mult);
  }
}
Beispiel #12
0
// SSE2 rand: steps four interleaved LCG streams held in the global
// 'cur_seed' (constants match the MSVC rand(): 214013/2531011, plus
// companion streams) and stores the raw state — or, in compatibility
// mode, four 15-bit values — through 'result'.
// NOTE(review): stores 16 bytes, so 'result' must point to at least
// four unsigned ints — confirm against callers.
void rand_sse( unsigned int* result )
{
	__declspec( align(16) ) __m128i cur_seed_split;
	__declspec( align(16) ) __m128i multiplier;
	__declspec( align(16) ) __m128i adder;
	__declspec( align(16) ) __m128i mod_mask;
	__declspec( align(16) ) __m128i sra_mask;
#ifdef RAND_SSE_COMPATABILITY
	__declspec( align(16) ) __m128i sseresult;
#endif

	// Per-stream LCG multipliers, increments, and the masks used below.
	__declspec( align(16) ) static const unsigned int mult[4] ={ 214013, 17405, 214013, 69069 };
	__declspec( align(16) ) static const unsigned int gadd[4] ={ 2531011, 10395331, 13737667, 1 };
	__declspec( align(16) ) static const unsigned int mask[4] ={ 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 };
	__declspec( align(16) ) static const unsigned int masklo[4] ={ 0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF };

	adder = _mm_load_si128( (__m128i*) gadd);
	multiplier = _mm_load_si128( (__m128i*) mult);
	mod_mask = _mm_load_si128( (__m128i*) mask);
	sra_mask = _mm_load_si128( (__m128i*) masklo);
	// _mm_mul_epu32 only multiplies lanes 0 and 2, so also run the
	// multiply on a lane-swapped copy to cover the odd lanes.
	cur_seed_split = _mm_shuffle_epi32( cur_seed, _MM_SHUFFLE( 2, 3, 0, 1 ) );
	cur_seed = _mm_mul_epu32( cur_seed, multiplier );
	multiplier = _mm_shuffle_epi32( multiplier, _MM_SHUFFLE( 2, 3, 0, 1 ) );
	cur_seed_split = _mm_mul_epu32( cur_seed_split, multiplier );
	// Keep only the low 32 bits of each 64-bit product, then merge the
	// two halves back into one four-lane state vector and add increments.
	cur_seed = _mm_and_si128( cur_seed, mod_mask);
	cur_seed_split = _mm_and_si128( cur_seed_split, mod_mask );
	cur_seed_split = _mm_shuffle_epi32( cur_seed_split, _MM_SHUFFLE( 2, 3, 0, 1 ) );
	cur_seed = _mm_or_si128( cur_seed, cur_seed_split );
	cur_seed = _mm_add_epi32( cur_seed, adder);

#ifdef RAND_SSE_COMPATABILITY // Add the lines below if you wish to reduce your results to 16-bit vals...
	// rand()-compatible output: bits 16..30 of each lane, masked to 0x7FFF.
	sseresult = _mm_srai_epi32( cur_seed, 16);
	sseresult = _mm_and_si128( sseresult, sra_mask );
	_mm_storeu_si128( (__m128i*) result, sseresult );
	return;
#endif
	_mm_storeu_si128( (__m128i*) result, cur_seed);
	return;
}
Beispiel #13
0
/**
 * Return two random floats from 0 to 1 in positions 0 and 2.
 *
 * Advances the global SIMD seed 'mirand2' with a multiply by 16807 (the
 * classic Lehmer/minstd multiplier), then bit-twiddles the new state
 * into floats in [0, 1).
 */
__m128 random_number2(){
	// _mm_mul_epu32 updates lanes 0 and 2 with full 64-bit products.
	mirand2 = _mm_mul_epu32(mirand2, _mm_set1_epi32(16807));

	// Keep 23 mantissa bits and splice in the exponent pattern of 1.0f,
	// producing floats in the range [1, 2).
	const __m128i mantissa = _mm_and_si128(mirand2, _mm_set1_epi32(0x007fffff));
	const __m128i one_to_two = _mm_or_si128(mantissa, _mm_set1_epi32(0x3f800000));

	// Reinterpret as float and shift the range down to [0, 1).
	return _mm_sub_ps(_mm_castsi128_ps(one_to_two), _mm_set1_ps(1));
}
Beispiel #14
0
    MEMALIGN(16, __m128i cur_seed_split);
    MEMALIGN(16, __m128i multiplier);
    MEMALIGN(16, __m128i adder);
    MEMALIGN(16, __m128i mod_mask);
    MEMALIGN(16, __m128 res);
    MEMALIGN(16, static const unsigned int mult  [4]) = {214013, 17405, 214013, 69069};
    MEMALIGN(16, static const unsigned int gadd  [4]) = {2531011, 10395331, 13737667, 1};
    MEMALIGN(16, static const unsigned int mask  [4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0};

    adder          = _mm_load_si128((__m128i*)gadd);
    multiplier     = _mm_load_si128((__m128i*)mult);
    mod_mask       = _mm_load_si128((__m128i*)mask);
    cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1));

    m_sseSeed      = _mm_mul_epu32(m_sseSeed, multiplier);
    multiplier     = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
    cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

    m_sseSeed      = _mm_and_si128(m_sseSeed, mod_mask);
    cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
    cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));
    m_sseSeed      = _mm_or_si128(m_sseSeed, cur_seed_split);
    m_sseSeed      = _mm_add_epi32(m_sseSeed, adder);

    /* adjust the value to the range requested */
    res = _mm_cvtepi32_ps(m_sseSeed);
    if (sseresult)
      *sseresult = _mm_mul_ps(res, f);
    else
    {