// Lane-wise multiply of packed 32-bit integers, keeping the low 32 bits of
// each product (the SSE2 equivalent of SSE4.1's _mm_mullo_epi32).
// Note: the low 32 bits of an unsigned 32x32 product equal those of the
// signed product, so this is correct for signed inputs too.
inline __m128i func_mul_epu32(__m128i a, __m128i b){
#if 1
  // Multiply elements 0 and 2, and put the 64-bit results into a vector.
  __m128i tmp02 = _mm_mul_epu32(a, b);
  // Shift the vectors right by one dword (4 bytes), making 3->2 and 1->0,
  // then widen-multiply to get the products of the odd elements.
  __m128i tmp13 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  // Shuffle each result so the low 32 bits of both products land in the
  // two lowest dwords.
  __m128i tmpres02 = _mm_shuffle_epi32(tmp02, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i tmpres13 = _mm_shuffle_epi32(tmp13, _MM_SHUFFLE(0, 0, 2, 0));
  // Interleave the shuffled vectors back into lane order [r0 r1 r2 r3].
  return _mm_unpacklo_epi32(tmpres02, tmpres13);
#else
  // Scalar fallback.  The previous version referenced pvInt / VEC_SIZE,
  // which are not declared in this translation unit and would not compile
  // if this branch were enabled; this version is self-contained.
  int pa[4], pb[4], pr[4];
  _mm_storeu_si128((__m128i *)pa, a);
  _mm_storeu_si128((__m128i *)pb, b);
  for (int m = 0; m < 4; m++) {
    // Multiply as unsigned to avoid signed-overflow UB; low bits match.
    pr[m] = (int)((unsigned)pa[m] * (unsigned)pb[m]);
  }
  return _mm_loadu_si128((const __m128i *)pr);
#endif
}
/*Multiplies each 32-bit lane of a by the scalar b1, keeping the low 32 bits
  of each product, using SSE2 only (no SSE4.1 _mm_mullo_epi32).*/
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
  __m128i b;
  __m128i even;
  __m128i odd;
  b = _mm_set1_epi32(b1);
  /*Widening multiplies of the even lanes (0, 2) and, after a one-dword
    shift, the odd lanes (1, 3).*/
  even = _mm_mul_epu32(a, b);
  odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  /*Gather the low half of each 64-bit product and interleave the two
    halves back into lane order.*/
  even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
  odd = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi32(even, odd);
}
// Lane-wise signed 32-bit multiply of two SIMD values.
// Prefers SSE4.1's native _mm_mullo_epi32, falls back to an SSE2 emulation
// built from widening multiplies, and finally to scalar code.
SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
{
    SIMDValue result;
    X86SIMDValue x86Result;
    const X86SIMDValue x86A = X86SIMDValue::ToX86SIMDValue(aValue);
    const X86SIMDValue x86B = X86SIMDValue::ToX86SIMDValue(bValue);

    if (AutoSystemInfo::Data.SSE4_1Available())
    {
        // a * b, only available in SSE4.1.
        x86Result.m128i_value = _mm_mullo_epi32(x86A.m128i_value, x86B.m128i_value);
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else if (AutoSystemInfo::Data.SSE2Available())
    {
        // Widening products of lanes 0/2, then of lanes 1/3 after shifting
        // each vector right by one dword.
        const __m128i prodEven = _mm_mul_epu32(x86A.m128i_value, x86B.m128i_value);
        const __m128i prodOdd = _mm_mul_epu32(_mm_srli_si128(x86A.m128i_value, 4),
                                              _mm_srli_si128(x86B.m128i_value, 4));
        // Pull the low 32 bits of each 64-bit product back into lane order.
        x86Result.m128i_value = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(prodEven, _MM_SHUFFLE(0, 0, 2, 0)),
            _mm_shuffle_epi32(prodOdd, _MM_SHUFFLE(0, 0, 2, 0)));
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else
    {
        // Scalar fallback: multiply each component independently.
        result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
        result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
        result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
        result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
    }

    return result;
}
// Converts an 8-decimal-digit value (0..99999999) into eight packed 16-bit
// digits {a,b,c,d,e,f,g,h} using SSE2 fixed-point reciprocal arithmetic.
// NOTE(review): depends on externally defined constant tables
// (kDiv10000Vector, k10000Vector, kDivPowersVector, kShiftPowersVector,
// k10Vector) not visible in this file -- presumably reciprocal/power-of-ten
// tables whose values must match the shift counts below; confirm at their
// definition site.
inline __m128i Convert8DigitsSSE2(uint32_t value) {
  assert(value <= 99999999);

  // abcd, efgh = abcdefgh divmod 10000
  // (multiply by a precomputed reciprocal of 10000, then shift right by 45)
  const __m128i abcdefgh = _mm_cvtsi32_si128(value);
  const __m128i abcd = _mm_srli_epi64(
      _mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]),
      45);
  const __m128i efgh = _mm_sub_epi32(
      abcdefgh,
      _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

  // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
  const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

  // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
  const __m128i v1a = _mm_slli_epi64(v1, 2);

  // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
  // (broadcast each half into four lanes by double unpacking)
  const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
  const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

  // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
  // (two mulhi steps: per-lane reciprocals, then per-lane shift factors)
  const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
  const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

  // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
  const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

  // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
  const __m128i v6 = _mm_slli_epi64(v5, 16);

  // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
  // (each lane minus 10x the previous lane isolates a single digit)
  const __m128i v7 = _mm_sub_epi16(v4, v6);

  return v7;
}
// Scales four pairs of 32-bit accumulators by *mult, truncating (flooring)
// the fixed-point products, and stores the result as 8 saturated bytes at
// dst.  A0/A1 hold the even lanes, A2/A3 the odd lanes (pre-shifted).
static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
                                              const __m128i* const A1,
                                              const __m128i* const A2,
                                              const __m128i* const A3,
                                              const __m128i* const mult,
                                              uint8_t* const dst) {
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  // Unsigned 32x32 -> 64-bit products for each sub-vector.
  const __m128i mul0 = _mm_mul_epu32(*A0, *mult);
  const __m128i mul1 = _mm_mul_epu32(*A1, *mult);
  const __m128i mul2 = _mm_mul_epu32(*A2, *mult);
  const __m128i mul3 = _mm_mul_epu32(*A3, *mult);
  // Even lanes: drop the fixed-point fraction.
  const __m128i even0 = _mm_srli_epi64(mul0, WEBP_RESCALER_RFIX);
  const __m128i even1 = _mm_srli_epi64(mul1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
  // Odd lanes: shift the integer part up into the high dword, then mask.
  const __m128i odd0 =
      _mm_and_si128(_mm_slli_epi64(mul2, 32 - WEBP_RESCALER_RFIX), mask);
  const __m128i odd1 =
      _mm_and_si128(_mm_slli_epi64(mul3, 32 - WEBP_RESCALER_RFIX), mask);
#else
  // RFIX == 32: the integer part already occupies the high dword.
  const __m128i odd0 = _mm_and_si128(mul2, mask);
  const __m128i odd1 = _mm_and_si128(mul3, mask);
#endif
  // Merge even/odd lanes and narrow 32 -> 16 -> 8 bits with saturation.
  const __m128i row0 = _mm_or_si128(even0, odd0);
  const __m128i row1 = _mm_or_si128(even1, odd1);
  const __m128i packed16 = _mm_packs_epi32(row0, row1);
  const __m128i packed8 = _mm_packus_epi16(packed16, packed16);
  _mm_storel_epi64((__m128i*)dst, packed8);
}
// Scales four pairs of 32-bit accumulators by *mult with rounding (adds
// ROUNDER before the fixed-point shift), recombines the 64-bit products
// into packed 32-bit values and stores them as 8 saturated bytes at dst.
// A0/A1 hold the even lanes, A2/A3 the odd lanes (pre-shifted).
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  // Unsigned 32x32 -> 64-bit products, rounded.
  const __m128i mul0 = _mm_add_epi64(_mm_mul_epu32(*A0, *mult), rounder);
  const __m128i mul1 = _mm_add_epi64(_mm_mul_epu32(*A1, *mult), rounder);
  const __m128i mul2 = _mm_add_epi64(_mm_mul_epu32(*A2, *mult), rounder);
  const __m128i mul3 = _mm_add_epi64(_mm_mul_epu32(*A3, *mult), rounder);
  // Even lanes: drop the fixed-point fraction.  Odd lanes: keep the high
  // dword as-is (NOTE(review): this path looks like it assumes
  // WEBP_RESCALER_RFIX == 32 -- confirm against the macro's definition).
  const __m128i even0 = _mm_srli_epi64(mul0, WEBP_RESCALER_RFIX);
  const __m128i even1 = _mm_srli_epi64(mul1, WEBP_RESCALER_RFIX);
  const __m128i odd0 = _mm_and_si128(mul2, mask);
  const __m128i odd1 = _mm_and_si128(mul3, mask);
  // Merge even/odd lanes and narrow 32 -> 16 -> 8 bits with saturation.
  const __m128i row0 = _mm_or_si128(even0, odd0);
  const __m128i row1 = _mm_or_si128(even1, odd1);
  const __m128i packed16 = _mm_packs_epi32(row0, row1);
  const __m128i packed8 = _mm_packus_epi16(packed16, packed16);
  _mm_storel_epi64((__m128i*)dst, packed8);
}
// Horizontal shrink of one input row: accumulates 4-channel 8-bit source
// pixels (widened to 16 bits) under the rescaler's fixed-point accumulator
// and emits one frow entry per destination pixel.  Falls back to the C
// implementation when the input is not 4-channel or the reduction ratio is
// stronger than 1/128 (which would overflow the 16-bit sums -- see below).
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  // Vector path requires exactly 4 channels and x_add <= x_sub * 128.
  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    // Consume whole source pixels while the accumulator stays positive;
    // each 32-bit pixel load is widened to 4 x 16-bit channels.
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      // frac = base * (-accum): the fractional part of the last pixel
      // that belongs to the NEXT destination pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      // Full 32-bit output value: sum * x_sub - frac.
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);     // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);  // sum * x_sub - frac
      // Carry the fraction into the next sum: sum = frac * fx_scale,
      // rounded, computed as two 32x32->64 multiplies on even/odd lanes
      // and recombined via shuffles.
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);     // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
// Lane-wise signed 32-bit integer multiply, keeping the low 32 bits of
// each product.  Dispatches on the configured SIMD backend.
__SIMDi _SIMD_mul_epi32(__SIMDi a, __SIMDi b)
{
#ifdef USE_SSE
  __m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
  __m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */
  /* shuffle results to [63..0] and pack */
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)),
                            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
#elif defined USE_AVX
  /* Fixed: the previous code called _m256_mul_ps, which does not exist
     (the float intrinsic is spelled _mm256_mul_ps) and would have been a
     floating-point multiply on integer data anyway.  The packed 32-bit
     integer low multiply is _mm256_mullo_epi32 (requires AVX2). */
  return _mm256_mullo_epi32(a, b);
#elif defined USE_IBM
  return vec_mul(a,b);
#endif
}
// SSE2 rand(): steps four linear-congruential generators packed in the
// global __m128i state __ccaprice_stdlib_rseed and returns one 15-bit
// value from lane 0, like the classic C rand().
// NOTE(review): DATA(x) is a macro defined before this point; judging by
// the loads below it presumably declares a 16-byte-aligned array named
// data_<x> -- confirm at its definition site.
inline int rand() {
  __m128i split;
  __m128i multi;
  __m128i adder;
  __m128i mmask;
  __m128i smask;
  __m128i store;
  // Per-lane LCG multipliers and increments; mmask keeps lanes 0/2 only,
  // smask clamps results to 15 bits.
  DATA(multi)={0x000343FD,0x000043FD,0x000343FD,0x00010DCD};
  DATA(adder)={0x00269EC3,0x009E9EC3,0x00D19EC3,0x00000001};
  DATA(mmask)={0xFFFFFFFF,0x00000000,0xFFFFFFFF,0x00000000};
  DATA(smask)={0x00007FFF,0x00007FFF,0x00007FFF,0x00007FFF};
#undef DATA
  adder = _mm_load_si128 ((__m128i*)data_adder);
  multi = _mm_load_si128 ((__m128i*)data_multi);
  mmask = _mm_load_si128 ((__m128i*)data_mmask);
  smask = _mm_load_si128 ((__m128i*)data_smask);
  // _mm_mul_epu32 only multiplies lanes 0 and 2, so run the odd lanes
  // through a lane-swapped copy (split) and merge the halves afterwards.
  split = _mm_shuffle_epi32( __ccaprice_stdlib_rseed, __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE );
  __ccaprice_stdlib_rseed = _mm_mul_epu32(__ccaprice_stdlib_rseed, multi);
  multi = _mm_shuffle_epi32( multi, __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE );
  split = _mm_mul_epu32(split, multi);
  // Keep only the low 32 bits of each 64-bit product, then re-interleave
  // and apply the per-lane increments.
  __ccaprice_stdlib_rseed = _mm_and_si128(__ccaprice_stdlib_rseed, mmask);
  split = _mm_and_si128(split, mmask);
  split = _mm_shuffle_epi32( split, __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE );
  __ccaprice_stdlib_rseed = _mm_or_si128 (__ccaprice_stdlib_rseed, split);
  __ccaprice_stdlib_rseed = _mm_add_epi32 (__ccaprice_stdlib_rseed, adder);
  // Emit bits 16..30 of lane 0 (arithmetic shift right 16, mask to 15 bits).
  store = _mm_srai_epi32(__ccaprice_stdlib_rseed, 0x10);
  store = _mm_and_si128 (store, smask);
  return (unsigned int)_mm_cvtsi128_si32(store);
#undef __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
#undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS2
#undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS1
}
// Clang codegen regression test: checks that the _mm_mul_epu32 wrapper
// lowers to the llvm.x86.sse2.pmulu.dq intrinsic in IR and to the PMULUDQ
// instruction in assembly.  The DAG/ASM lines below are FileCheck
// directives and must not be edited.
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_mul_epu32
  // DAG: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  //
  // ASM-LABEL: test_mm_mul_epu32
  // ASM: pmuludq
  return _mm_mul_epu32(A, B);
}
// load *src as epi64, multiply by mult and store result in [out0 ... out3] static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, const __m128i* const mult, __m128i* const out0, __m128i* const out1, __m128i* const out2, __m128i* const out3) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4)); const __m128i A2 = _mm_srli_epi64(A0, 32); const __m128i A3 = _mm_srli_epi64(A1, 32); if (mult != NULL) { *out0 = _mm_mul_epu32(A0, *mult); *out1 = _mm_mul_epu32(A1, *mult); *out2 = _mm_mul_epu32(A2, *mult); *out3 = _mm_mul_epu32(A3, *mult); } else { *out0 = A0; *out1 = A1; *out2 = A2; *out3 = A3; } }
// SSE2 port of the classic Microsoft LCG rand(), stepping four generators
// in parallel.  Updates the global __m128i state `cur_seed` (defined
// elsewhere) and stores four 32-bit results to `result`; when
// RAND_SSE_COMPATABILITY is defined it instead stores four 15-bit values
// matching the range of the scalar rand().
void rand_sse( unsigned int* result )
{
  __declspec( align(16) ) __m128i cur_seed_split;
  __declspec( align(16) ) __m128i multiplier;
  __declspec( align(16) ) __m128i adder;
  __declspec( align(16) ) __m128i mod_mask;
  __declspec( align(16) ) __m128i sra_mask;
#ifdef RAND_SSE_COMPATABILITY
  __declspec( align(16) ) __m128i sseresult;
#endif
  // Per-lane LCG multipliers/increments and masks (mask keeps lanes 0/2,
  // masklo clamps to 15 bits for compatibility mode).
  __declspec( align(16) ) static const unsigned int mult[4] =
      { 214013, 17405, 214013, 69069 };
  __declspec( align(16) ) static const unsigned int gadd[4] =
      { 2531011, 10395331, 13737667, 1 };
  __declspec( align(16) ) static const unsigned int mask[4] =
      { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 };
  __declspec( align(16) ) static const unsigned int masklo[4] =
      { 0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF };

  adder = _mm_load_si128( (__m128i*) gadd);
  multiplier = _mm_load_si128( (__m128i*) mult);
  mod_mask = _mm_load_si128( (__m128i*) mask);
  sra_mask = _mm_load_si128( (__m128i*) masklo);

  // _mm_mul_epu32 only multiplies lanes 0 and 2, so process the odd lanes
  // through a lane-swapped copy and merge the two halves afterwards.
  cur_seed_split = _mm_shuffle_epi32( cur_seed, _MM_SHUFFLE( 2, 3, 0, 1 ) );
  cur_seed = _mm_mul_epu32( cur_seed, multiplier );
  multiplier = _mm_shuffle_epi32( multiplier, _MM_SHUFFLE( 2, 3, 0, 1 ) );
  cur_seed_split = _mm_mul_epu32( cur_seed_split, multiplier );
  // Keep only the low 32 bits of each 64-bit product, recombine, and add
  // the per-lane increments.
  cur_seed = _mm_and_si128( cur_seed, mod_mask);
  cur_seed_split = _mm_and_si128( cur_seed_split, mod_mask );
  cur_seed_split = _mm_shuffle_epi32( cur_seed_split, _MM_SHUFFLE( 2, 3, 0, 1 ) );
  cur_seed = _mm_or_si128( cur_seed, cur_seed_split );
  cur_seed = _mm_add_epi32( cur_seed, adder);

#ifdef RAND_SSE_COMPATABILITY
  // Add the lines below if you wish to reduce your results to 16-bit vals...
  sseresult = _mm_srai_epi32( cur_seed, 16);
  sseresult = _mm_and_si128( sseresult, sra_mask );
  _mm_storeu_si128( (__m128i*) result, sseresult );
  return;
#endif
  _mm_storeu_si128( (__m128i*) result, cur_seed);
  return;
}
/**
 * Advance the global LCG state `mirand2` (lanes 0 and 2 are multiplied by
 * 16807 via _mm_mul_epu32), then map the low 23 bits of each lane into the
 * mantissa of a float in [1, 2) and subtract 1.
 * Return two random floats from 0 to 1 in positions 0 and 2.
 */
__m128 random_number2(){
  const __m128i mantissa_mask = _mm_set1_epi32(0x007fffff);
  const __m128i one_exponent = _mm_set1_epi32(0x3f800000);
  mirand2 = _mm_mul_epu32(mirand2, _mm_set1_epi32(16807));
  /* Build float bit patterns: random mantissa OR'd with the exponent of
     1.0f gives a value in [1, 2). */
  const __m128i bits =
      _mm_or_si128(_mm_and_si128(mirand2, mantissa_mask), one_exponent);
  return _mm_sub_ps(_mm_castsi128_ps(bits), _mm_set1_ps(1));
}
MEMALIGN(16, __m128i cur_seed_split); MEMALIGN(16, __m128i multiplier); MEMALIGN(16, __m128i adder); MEMALIGN(16, __m128i mod_mask); MEMALIGN(16, __m128 res); MEMALIGN(16, static const unsigned int mult [4]) = {214013, 17405, 214013, 69069}; MEMALIGN(16, static const unsigned int gadd [4]) = {2531011, 10395331, 13737667, 1}; MEMALIGN(16, static const unsigned int mask [4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}; adder = _mm_load_si128((__m128i*)gadd); multiplier = _mm_load_si128((__m128i*)mult); mod_mask = _mm_load_si128((__m128i*)mask); cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1)); m_sseSeed = _mm_mul_epu32(m_sseSeed, multiplier); multiplier = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1)); cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier); m_sseSeed = _mm_and_si128(m_sseSeed, mod_mask); cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask); cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1)); m_sseSeed = _mm_or_si128(m_sseSeed, cur_seed_split); m_sseSeed = _mm_add_epi32(m_sseSeed, adder); /* adjust the value to the range requested */ res = _mm_cvtepi32_ps(m_sseSeed); if (sseresult) *sseresult = _mm_mul_ps(res, f); else {