inline void Cryptor::assistKey192(__m128i *tmp, __m128i *tmp2, __m128i *tmp3) {
  // Duplicate the 2nd 32-bit part 4 times:
  // [1, 2, 3, 4] -> [2, 2, 2, 2]
  __m128i tmp4;
  *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(1, 1, 1, 1));
  tmp4 = _mm_slli_si128(*tmp, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);
  tmp4 = _mm_slli_si128(tmp4, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);
  tmp4 = _mm_slli_si128(tmp4, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);
  *tmp = _mm_xor_si128(*tmp, *tmp2);

  // Duplicate the 4th 32-bit part 4 times.
  *tmp2 = _mm_shuffle_epi32(*tmp, SHUFFLE4_32(3, 3, 3, 3));
  tmp4 = _mm_slli_si128(*tmp3, 0x4);
  *tmp3 = _mm_xor_si128(*tmp3, tmp4);
  *tmp3 = _mm_xor_si128(*tmp3, *tmp2);
}
template<typename T>
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
#if defined(__SSE2__)
 {
  for(unsigned col = 0; col < 8; col++)
  {
   __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);

   for(unsigned x = 0; x < 8; x++)
   {
    __m128i sum;
    __m128i m;
    int32 tmp[4] MDFN_ALIGN(16);

    m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
    sum = _mm_madd_epi16(m, c);
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));

    //_mm_store_ss((float *)&tmp[0], (__m128)sum);
    _mm_store_si128((__m128i *)tmp, sum);

    if(sizeof(T) == 1)
     out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
    else
     out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
   }
  }
 }
#endif
}
void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
                                          int stride) {
  uint8_t abs_diff;
  __m128i d;
  int i = 8;

  // Clip |diff| to the [0, 255] range, then splat that byte into all 16 lanes.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
  }
  d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

  do {
    // Prediction data.
    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));

    // Do saturating addition or subtraction according to the sign of diff.
    if (diff >= 0) {
      p0 = _mm_adds_epu8(p0, d);
      p1 = _mm_adds_epu8(p1, d);
      p2 = _mm_adds_epu8(p2, d);
      p3 = _mm_adds_epu8(p3, d);
      p4 = _mm_adds_epu8(p4, d);
      p5 = _mm_adds_epu8(p5, d);
      p6 = _mm_adds_epu8(p6, d);
      p7 = _mm_adds_epu8(p7, d);
    } else {
      p0 = _mm_subs_epu8(p0, d);
      p1 = _mm_subs_epu8(p1, d);
      p2 = _mm_subs_epu8(p2, d);
      p3 = _mm_subs_epu8(p3, d);
      p4 = _mm_subs_epu8(p4, d);
      p5 = _mm_subs_epu8(p5, d);
      p6 = _mm_subs_epu8(p6, d);
      p7 = _mm_subs_epu8(p7, d);
    }

    // Store results
    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);

    dest += 4 * stride;
  } while (--i);
}
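/* The d vector above uses a handy SSE2 idiom: multiplying a byte by
   0x01010101 replicates it across a 32-bit word, and _mm_shuffle_epi32 with
   selector 0 broadcasts that word to all four lanes. A minimal standalone
   demo of just the splat (an illustrative addition, not part of libvpx): */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t b = 0x2a;
  const __m128i d =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(b * 0x01010101u)), 0);
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, d);
  for (int i = 0; i < 16; i++)
    printf("%02x ", out[i]);  /* prints sixteen copies of 2a */
  printf("\n");
  return 0;
}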
void INIT_Htable(uint8_t Htbl[16*8], uint8_t *H) {
  int i;
  __m128i T, TMP0, TMP1, TMP2, TMP3, TMP4, POLY;
  POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);
  T = _mm_loadu_si128((__m128i*)H);
  TMP0 = T;
  _mm_storeu_si128(&((__m128i*)Htbl)[0], T);
  // Precompute Htbl[i] = H^(i+1) for the aggregated-reduction GHASH path.
  for (i = 1; i < 8; i++) {
    // Schoolbook 128x128 carry-less multiply (four PCLMULQDQs): T = T * H.
    TMP1 = _mm_clmulepi64_si128(T, TMP0, 0x00);
    TMP4 = _mm_clmulepi64_si128(T, TMP0, 0x11);
    TMP2 = _mm_clmulepi64_si128(T, TMP0, 0x10);
    TMP3 = _mm_clmulepi64_si128(T, TMP0, 0x01);
    TMP2 = _mm_xor_si128(TMP2, TMP3);
    TMP3 = _mm_slli_si128(TMP2, 8);
    TMP2 = _mm_srli_si128(TMP2, 8);
    TMP1 = _mm_xor_si128(TMP3, TMP1);
    TMP4 = _mm_xor_si128(TMP4, TMP2);
    // Two folding steps reduce the 256-bit product modulo the field polynomial.
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    T = _mm_xor_si128(TMP4, TMP1);
    _mm_storeu_si128(&((__m128i*)Htbl)[i], T);
  }
}
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
  __m128i b = _mm_set1_epi32(b1);
  __m128i lo = _mm_mul_epu32(a, b);
  __m128i hi = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)),
   _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0)));
}
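/* A small check harness (an addition for illustration, not from the daala
   source): od_mullo_epi32_sse2 emulates the SSE4.1 _mm_mullo_epi32 on plain
   SSE2; since the low 32 bits of an unsigned 32x32 product equal those of
   the signed product, the result is correct for negative inputs too. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int32_t in[4] = { 3, -7, 100000, -123456 };
  int32_t out[4];
  __m128i r = od_mullo_epi32_sse2(_mm_loadu_si128((const __m128i *)in), 9);
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 4; i++)
    printf("%d * 9 = %d (scalar: %d)\n", in[i], out[i], in[i] * 9);
  return 0;
}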
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
{
  __m128i T0 = _mm_unpacklo_epi64(B0, B1);
  __m128i T1 = _mm_unpacklo_epi64(B2, B3);
  __m128i T2 = _mm_unpackhi_epi64(B0, B1);
  __m128i T3 = _mm_unpackhi_epi64(B2, B3);

  T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

  T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

  T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

  B0 = _mm_unpacklo_epi32(T0, T1);
  B1 = _mm_unpackhi_epi32(T0, T1);
  B2 = _mm_unpacklo_epi32(T2, T3);
  B3 = _mm_unpackhi_epi32(T2, T3);
}
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size) {
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[2], xmm1[2];

  numof16belem = size / (16*2);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of 32 bytes */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
    }
    /* Store the result vectors */
    for (k = 0; k < 2; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
    }
  }
}
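/* For reference, a scalar version of what both shuffle2 variants above
   compute, assuming the usual Blosc shuffle layout (an illustrative
   addition, not part of the Blosc sources): the low bytes of all 2-byte
   elements are gathered first, followed by all the high bytes. */
#include <stddef.h>
#include <stdint.h>

static void shuffle2_scalar(uint8_t* dest, const uint8_t* src, size_t size) {
  const size_t n = size / 2;  /* number of 2-byte elements */
  for (size_t i = 0; i < n; i++) {
    dest[i]     = src[2 * i];      /* first byte of each element */
    dest[n + i] = src[2 * i + 1];  /* second byte of each element */
  }
}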
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));
    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
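/* The four-lane reduction at the tail of the function above is a reusable
   idiom, isolated here as a helper for clarity (an illustrative addition,
   not part of the GStreamer source): */
#include <emmintrin.h>
#include <stdint.h>

static inline int32_t hsum_epi32_sse2(__m128i v) {
  /* fold the high 64 bits onto the low 64 bits, then lane 1 onto lane 0 */
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 2, 3)));
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1)));
  return _mm_cvtsi128_si32(v);
}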
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
{
  __m128i key1 = *K1;
  __m128i key2 = *K2;
  key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
  key1 = _mm_xor_si128(key1, key2_with_rcon);

  *K1 = key1;
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

  if(last)
    return;

  key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
  key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

  *K2 = key2;
  out[4] = _mm_cvtsi128_si32(key2);
  out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
}
inline __m128i func_mul_epu32(__m128i a, __m128i b){
#if 1
  // Multiply elements 0 and 2, and put the 64-bit results into a vector.
  __m128i tmp02 = _mm_mul_epu32(a, b);
  // Shift the vectors by one word to the right, making 3->2 and 1->0, and
  // then multiply into a double-word vector.
  __m128i tmp13 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  // Shuffle the vectors to place the lower 32 bits of each result in the
  // lower two words. I have some concerns about endianness and portability
  // related to this function.
  __m128i tmpres02 = _mm_shuffle_epi32(tmp02, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i tmpres13 = _mm_shuffle_epi32(tmp13, _MM_SHUFFLE(0, 0, 2, 0));
  // Unpack the shuffled vectors into a return value.
  return _mm_unpacklo_epi32(tmpres02, tmpres13);
#else
  pvInt ret;
  int *p_a;
  int *p_b;
  p_a = (int *)&a;
  p_b = (int *)&b;
  for(int m = 0; m < VEC_SIZE; m++){
    ret.v[m] = p_a[m] * p_b[m];
  }
  return ret.r;
#endif
}
//FINL
int32 __ext_v_sum_int32(int32* x, int len)
{
  __m128i msum = _mm_setzero_si128();
  int32 ret;
  const int wlen = 4;

  for (int i = 0; i < len / wlen; i++)
  {
    __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
    msum = _mm_add_epi32(msum, mx);
  }

  // Horizontal sum: rotate the vector by one lane three times, accumulating,
  // so every lane ends up holding the total.
  __m128i mout = msum;
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi32(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi32(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi32(mout, msum);
  ret = _mm_cvtsi128_si32(mout);

  // Handle the tail that doesn't fill a full vector.
  for (int i = (len / wlen) * wlen; i < len; i++)
  {
    ret += x[i];
  }
  return ret;
}
void Polyval_Horner(unsigned char* TAG, unsigned char* pH, unsigned char* inp, int length)
{
  __m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
  int i = 0;
  if (length == 0)
    return;
  int has_semi = length % 16;
  uint8_t B[16] = {0};
  length /= 16;

  H = _mm_loadu_si128((__m128i*)pH);
  T = _mm_loadu_si128((__m128i*)TAG);
  POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);

  // Horner evaluation: fold each 16-byte block into T, then T = T * H.
  for (i = 0; i < length; i++)
  {
    T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
    TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
    TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
    TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
    TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
    TMP2 = _mm_xor_si128(TMP2, TMP3);
    TMP3 = _mm_slli_si128(TMP2, 8);
    TMP2 = _mm_srli_si128(TMP2, 8);
    TMP1 = _mm_xor_si128(TMP3, TMP1);
    TMP4 = _mm_xor_si128(TMP4, TMP2);
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    T = _mm_xor_si128(TMP4, TMP1);
  }
  // Process the final partial block, zero-padded to a full 16 bytes.
  if (has_semi != 0)
  {
    memcpy(B, inp + length*16, has_semi);
    T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
    TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
    TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
    TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
    TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
    TMP2 = _mm_xor_si128(TMP2, TMP3);
    TMP3 = _mm_slli_si128(TMP2, 8);
    TMP2 = _mm_srli_si128(TMP2, 8);
    TMP1 = _mm_xor_si128(TMP3, TMP1);
    TMP4 = _mm_xor_si128(TMP4, TMP2);
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
    TMP3 = _mm_shuffle_epi32(TMP1, 78);
    TMP1 = _mm_xor_si128(TMP3, TMP2);
    T = _mm_xor_si128(TMP4, TMP1);
  }
  _mm_storeu_si128((__m128i*)TAG, T);
}
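/* Hedged usage sketch (an addition, not from the original source): because
   TAG is both input and output, streaming callers can chain calls over
   16-byte-multiple chunks; a one-shot tag starts from a zeroed accumulator. */
#include <stdint.h>
#include <string.h>

void polyval_tag(uint8_t tag[16], uint8_t h[16], uint8_t *msg, int len) {
  memset(tag, 0, 16);  /* fresh accumulator */
  Polyval_Horner(tag, h, msg, len);
}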
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
static long
conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples)
{
  long i = 0;
  long remainder;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      const long n = (samples / 2) * 2;
      const __v4sf *s = (const __v4sf*) src;
            __v4sf *d = (__v4sf*) dst;

      for ( ; i < n; i += 2)
        {
          __v4sf rbaa0, rbaa1;

          __v4sf rgba0 = *s++;
          __v4sf rgba1 = *s++;

          /* Expand alpha */
          __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
          __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));

          /* Premultiply */
          rgba0 = rgba0 * aaaa0;
          rgba1 = rgba1 * aaaa1;

          /* Shuffle the original alpha value back in */
          rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0));
          rbaa1 = _mm_shuffle_ps(rgba1, aaaa1, _MM_SHUFFLE(0, 0, 2, 0));

          rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0));
          rgba1 = _mm_shuffle_ps(rgba1, rbaa1, _MM_SHUFFLE(2, 1, 1, 0));

          *d++ = rgba0;
          *d++ = rgba1;
        }
      _mm_empty ();
    }

  dst += i * 4;
  src += i * 4;
  remainder = samples - i;
  while (remainder--)
    {
      const float a = src[3];
      dst[0] = src[0] * a;
      dst[1] = src[1] * a;
      dst[2] = src[2] * a;
      dst[3] = a;
      src += 4;
      dst += 4;
    }

  return samples;
}
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane = output[0];
    LPBYTE uPlane   = output[1];
    LPBYTE vPlane   = output[2];
    int chrPitch    = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask  = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos    = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos  = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0 + width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1),
                                                      _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];
                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
DWORD WINAPI Autotune::AutotuneThread(void *i)
//--------------------------------------------
{
    AutotuneThreadData &info = *static_cast<AutotuneThreadData *>(i);
    info.histogram.resize(HISTORY_BINS, 0);
#ifdef ENABLE_SSE2
    const bool useSSE = (GetProcSupport() & PROCSUPPORT_SSE2) != 0;
#endif
    // Do autocorrelation and save results in a note histogram (restricted to one octave).
    for(int note = info.startNote, noteBin = note; note < info.endNote; note++, noteBin++)
    {
        if(noteBin >= HISTORY_BINS)
        {
            noteBin %= HISTORY_BINS;
        }

        const SmpLength autocorrShift = NoteToShift(info.sampleFreq, note, info.pitchReference);

        uint64 autocorrSum = 0;

#ifdef ENABLE_SSE2
        if(useSSE)
        {
            const __m128i *normalData = reinterpret_cast<const __m128i *>(info.sampleData);
            const __m128i *shiftedData = reinterpret_cast<const __m128i *>(info.sampleData + autocorrShift);
            for(SmpLength i = info.processLength / 8; i != 0; i--)
            {
                __m128i normal = _mm_loadu_si128(normalData++);
                __m128i shifted = _mm_loadu_si128(shiftedData++);
                __m128i diff = _mm_sub_epi16(normal, shifted);                      // 8 16-bit differences
                __m128i squares = _mm_madd_epi16(diff, diff);                       // Multiply and add: 4 32-bit squares
                __m128i sum1 = _mm_shuffle_epi32(squares, _MM_SHUFFLE(0, 1, 2, 3)); // Move upper two integers to lower
                __m128i sum2 = _mm_add_epi32(squares, sum1);                        // Now we can add the (originally) upper two and lower two integers
                __m128i sum3 = _mm_shuffle_epi32(sum2, _MM_SHUFFLE(1, 1, 1, 1));    // Move the second-lowest integer to lowest position
                __m128i sum4 = _mm_add_epi32(sum2, sum3);                           // Add the two lowest positions
                autocorrSum += _mm_cvtsi128_si32(sum4);
            }
        } else
#endif
        {
            const int16 *normalData = info.sampleData;
            const int16 *shiftedData = info.sampleData + autocorrShift;
            // Add up squared differences of all values
            for(SmpLength i = info.processLength; i != 0; i--, normalData++, shiftedData++)
            {
                autocorrSum += (*normalData - *shiftedData) * (*normalData - *shiftedData);
            }
        }

        info.histogram[noteBin] += autocorrSum;
    }
    return 0;
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {   // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);      // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);          // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);       // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);          // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
__SIMDi _SIMD_mul_epi32(__SIMDi a, __SIMDi b)
{
#ifdef USE_SSE
  __m128i tmp1 = _mm_mul_epu32(a, b);                                       /* mul 2,0 */
  __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); /* mul 3,1 */
  /* shuffle results to [63..0] and pack */
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
                            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
#elif defined USE_AVX
  return _mm256_mul_ps(a, b);
#elif defined USE_IBM
  return vec_mul(a, b);
#endif
}
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
void gf2x_mul2(unsigned long * t, unsigned long const * s1,
               unsigned long const * s2)
#endif
#endif
{
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;

    __v2di ss1, ss2, s1s, s2s;
    __v2di_proxy t00, tk;
#ifndef BORROW
    __v2di_proxy t11;
#endif

    ss1 = _mm_loadu_si128((__v2di *)s1);
    ss2 = _mm_loadu_si128((__v2di *)s2);

    t00.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    t11.s = _mm_clmulepi64_si128(ss1, ss2, 17);
#endif

    s1s = _mm_shuffle_epi32(ss1, 78);
    ss1 ^= s1s;
    s2s = _mm_shuffle_epi32(ss2, 78);
    ss2 ^= s2s;

    tk.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    tk.s ^= t00.s ^ t11.s;
#endif

    /* store result */
    t[0] = t00.x[0];
#ifdef BORROW
    t[1] = t00.x[1] ^ tk.x[0] ^ t00.x[0] ^ c[0];
    t[2] = c[0] ^ tk.x[1] ^ t00.x[1] ^ c[1];
    t[3] = c[1];
#else
    t[1] = t00.x[1] ^ tk.x[0];
    t[2] = t11.x[0] ^ tk.x[1];
    t[3] = t11.x[1];
#endif
#ifdef CARRY
    c[0] = t11.x[0];
    c[1] = t11.x[1];
#endif
}
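/* A bit-by-bit scalar model of the PCLMULQDQ building block used above
   (an illustrative addition, not part of gf2x): the 64x64 -> 128-bit
   carry-less product, against which the three-multiplication Karatsuba
   split can be tested. */
#include <stdint.h>

static void clmul64_ref(uint64_t a, uint64_t b, uint64_t r[2]) {
  r[0] = r[1] = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {  /* XOR in a shifted copy of a for each set bit of b */
      r[0] ^= a << i;
      if (i) r[1] ^= a >> (64 - i);
    }
  }
}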
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 2) {
    const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
    const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
    // Swap the R and B channels: _MM_SHUFFLE(3, 0, 1, 2) reorders each
    // pixel's 32-bit channel lanes from RGBA to BGRA.
    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
    _mm_storel_epi64((__m128i*)dst, tmp3);
    dst += 4 * 2;
  }
}
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16 *inVec1,  /* I input vector 1  */
    const opus_int16 *inVec2,  /* I input vector 2  */
    const opus_int   len       /* I vector lengths  */
)
{
    opus_int   i, dataSize8;
    opus_int64 sum;

    __m128i xmm_tempa;
    __m128i inVec1_76543210, acc1;
    __m128i inVec2_76543210, acc2;

    sum = 0;
    dataSize8 = len & ~7;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for( i = 0; i < dataSize8; i += 8 ) {
        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0]) );
        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0]) );

        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
        /* equal shift right 8 bytes */
        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

        acc1 = _mm_add_epi64( acc1, xmm_tempa );
        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
    }

    acc1 = _mm_add_epi64( acc1, acc2 );

    /* equal shift right 8 bytes */
    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    acc1 = _mm_add_epi64( acc1, acc2 );

    _mm_storel_epi64( (__m128i *)&sum, acc1 );

    for( ; i < len; i++ ) {
        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
    }

    return sum;
}
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value)
{
  int iLevel = 0;
  int lOffset = 0;
  int pOffset = 0;
  int32_t cmpmask = 0;
  int32_t eqmask = 0;

  // Broadcast the search key to all four lanes.
  __m128i key = _mm_cvtsi32_si128(value);
  key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0));

  while (iLevel < levels) {
    int f = fanout[iLevel];
    pOffset = lOffset;
    lOffset *= f - 1;

    // Compare the key against four delimiters at a time; the first delimiter
    // that is >= key determines the child to descend into.
    int iter = 0;
    int position = 0;
    while (iter < f/4) {
      __m128i delimiters = _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter*4]);
      __m128i compare = _mm_cmpgt_epi32(key, delimiters);
      cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
      cmpmask ^= 0x0F;
      if (cmpmask) {
        position = _bit_scan_forward(cmpmask);
        break;
      }
      iter++;
    }

    int offset = lOffset + iter*4 + position;
    lOffset = offset + pOffset;
    iLevel++;
  }
  return lOffset;
}
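/* Hedged usage sketch (an addition, not from the original source): a
   single-level tree with fanout 5, i.e. four 32-bit delimiters per node,
   16-byte aligned for the _mm_load_si128 above. The return value is the
   child/partition index. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  int32_t *level0 = aligned_alloc(16, 4 * sizeof(int32_t));
  level0[0] = 10; level0[1] = 20; level0[2] = 30; level0[3] = 40;
  int32_t *tree[1] = { level0 };
  int fanout[1] = { 5 };
  /* 25 falls between delimiters 20 and 30, so partition index 2 */
  printf("%d\n", searchSIMDTree(tree, fanout, 1, 25));
  free(level0);
  return 0;
}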
__inline__ static void mul4(__m128i in, __m128i *out) {
  // Byte-reverse, then double twice in GF(2^128): each doubling shifts left
  // by one bit and conditionally XORs the reduction constant (0x87) when the
  // top bit was set; the 1-masks carry bits across 32-bit lane boundaries.
  const __m128i shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i mask = _mm_set_epi32(135, 1, 1, 1);
  block intmp = _mm_shuffle_epi8(in, shuf);
  block tmp = _mm_srai_epi32(intmp, 31);
  tmp = _mm_and_si128(tmp, mask);
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
  *out = _mm_slli_epi32(intmp, 1);
  *out = _mm_xor_si128(*out, tmp);

  tmp = _mm_srai_epi32(*out, 31);
  tmp = _mm_and_si128(tmp, mask);
  tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
  *out = _mm_slli_epi32(*out, 1);
  *out = _mm_xor_si128(*out, tmp);
  *out = _mm_shuffle_epi8(*out, shuf);
}
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
{
  if(rcon)
  {
    input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15), input2);
    *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

    input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
    input1 = _mm_alignr_epi8(input1, input1, 1);
  }

  __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
  smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

  __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

  input1 = _mm_and_si128(low_nibs, input1);

  __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

  input1 = _mm_xor_si128(input1, t);

  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
  __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

  return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                 _mm_shuffle_epi8(sb1t, t6),
                 smeared);
}
inline void KEY_192_ASSIST(__m128i* temp1, __m128i * temp2, __m128i * temp3)
{
    __m128i temp4;
    *temp2 = _mm_shuffle_epi32 (*temp2, 0x55);
    temp4 = _mm_slli_si128 (*temp1, 0x4);
    *temp1 = _mm_xor_si128 (*temp1, temp4);
    temp4 = _mm_slli_si128 (temp4, 0x4);
    *temp1 = _mm_xor_si128 (*temp1, temp4);
    temp4 = _mm_slli_si128 (temp4, 0x4);
    *temp1 = _mm_xor_si128 (*temp1, temp4);
    *temp1 = _mm_xor_si128 (*temp1, *temp2);
    *temp2 = _mm_shuffle_epi32(*temp1, 0xff);
    temp4 = _mm_slli_si128 (*temp3, 0x4);
    *temp3 = _mm_xor_si128 (*temp3, temp4);
    *temp3 = _mm_xor_si128 (*temp3, *temp2);
}
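/* A hedged sketch of how KEY_192_ASSIST is driven, following the pattern in
   Intel's AES-NI white paper (only the first schedule step is shown; the full
   expansion repeats it with rcon values 0x1 through 0x80 and packs the
   resulting 192-bit round-key material): */
#include <wmmintrin.h>

static void aes192_first_step(const unsigned char *userkey, __m128i sched[2]) {
  __m128i temp1 = _mm_loadu_si128((const __m128i*)userkey);        /* key bits 0..127 */
  __m128i temp3 = _mm_loadu_si128((const __m128i*)(userkey + 16)); /* key bits 128..191 in the low half */
  __m128i temp2 = _mm_aeskeygenassist_si128(temp3, 0x1);           /* SubWord/RotWord + rcon */
  KEY_192_ASSIST(&temp1, &temp2, &temp3);
  sched[0] = temp1;
  sched[1] = temp3;
}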
// This function is called in code/WiFi/receiver/downSample.blk
//FINL
int __ext_permutatew1313 (struct complex16* x, int __unused_2, struct complex16* y, int __unused_1)
{
  assert (__unused_2 == 4);
  assert (__unused_1 == 4);

  //vcs *pi = (vcs *)x;
  //int tmp = 1;
  //vcs t1 = permutate<1,3,1,3>(pi[0]);
  //// tried to use this to avoid making a template,
  //// but the mm function also requires a constant expression as the second argument
  ////vcs t1 = (vcs)_mm_shuffle_epi32(pi[0], (((a3) << 6) | ((a2) << 4) | ((a1) << 2) | ((a0))));
  ////int mvp;
  ////vcs t1 = (vcs)_mm_shuffle_epi32(pi[0], mvp);
  //vcs *po = (vcs *)y;
  //*po = t1;

  __m128i mx = _mm_loadu_si128((__m128i *)x);
  _mm_storeu_si128((__m128i *)y, _mm_shuffle_epi32(mx, _MM_SHUFFLE(3, 1, 3, 1)));

  return 0;
}
SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
{
    SIMDValue result;

    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    if (AutoSystemInfo::Data.SSE4_1Available())
    {
        // a * b, only available in SSE4
        x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else if (AutoSystemInfo::Data.SSE2Available())
    {
        // mul 2,0: r0 = a0*b0; r1 = a2*b2
        __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value);
        // mul 3,1: r0 = a1*b1; r1 = a3*b3
        __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4),
                                     _mm_srli_si128(tmpbValue.m128i_value, 4));
        // shuffle x86Results to [63..0] and pack
        x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
                                                   _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else
    {
        result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
        result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
        result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
        result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
    }

    return result;
}