uint32_t probe(uint32_t key)
{
    /* create a vector with all values initialized to key */
    __m128i keyVector = _mm_set1_epi32(key);

    /* find the appropriate buckets using multiplicative hashing */
    __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128);
    bucketIds = _mm_srli_epi32(bucketIds, hashShift);
    size_t b0 = _mm_extract_epi32(bucketIds, 0);
    size_t b1 = _mm_extract_epi32(bucketIds, 1);

    __m128i keys;
    __m128i values0, values1;

    /* load keys, compare with lookup key (to produce a bitmask).
     * AND the result with the corresponding values. */
    keys = _mm_load_si128((const __m128i *) buckets[b0].keys);
    keys = _mm_cmpeq_epi32(keys, keyVector);
    values0 = _mm_load_si128((const __m128i *) buckets[b0].values);
    values0 = _mm_and_si128(values0, keys);

    keys = _mm_load_si128((const __m128i *) buckets[b1].keys);
    keys = _mm_cmpeq_epi32(keys, keyVector);
    values1 = _mm_load_si128((const __m128i *) buckets[b1].values);
    values1 = _mm_and_si128(values1, keys);

    /* OR all of the (key AND value) pairs to get result */
    union QuadInt qi;
    qi.vec128   = _mm_or_si128(values0, values1);
    qi.vec64[0] = _mm_or_si64(qi.vec64[0], qi.vec64[1]);

    return qi.arr[0] | qi.arr[1];
}
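/* A minimal sketch of the declarations probe() appears to rely on. These are
 * assumptions reconstructed from usage, not taken from the snippet's source:
 * the names Bucket, QuadInt, buckets, hashes and hashShift are hypothetical.
 * Note that _mm_mullo_epi32 and _mm_extract_epi32 require SSE4.1, and
 * _mm_or_si64 is an MMX intrinsic. */
union QuadInt {
    __m128i  vec128;    /* one 128-bit lane   */
    __m64    vec64[2];  /* two 64-bit halves  */
    uint32_t arr[4];    /* four 32-bit values */
};

struct Bucket {
    uint32_t keys[4];   /* 16-byte aligned, since _mm_load_si128 is used */
    uint32_t values[4];
};

static struct Bucket buckets[NUM_BUCKETS]; /* hash table storage (size assumed) */
static union QuadInt hashes;               /* per-lane multiplicative hash constants */
static unsigned      hashShift;            /* maps a 32-bit product to a bucket index */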
/* There's no equivalent in libc, though you'd expect one ...
 * std::mismatch exists, but it's not optimized at all. :( */
static inline size_t find_change(const uint16_t * a, const uint16_t * b)
{
    const __m128i *a128 = (const __m128i*)a;
    const __m128i *b128 = (const __m128i*)b;

    while (true)
    {
        __m128i v0    = _mm_loadu_si128(a128);
        __m128i v1    = _mm_loadu_si128(b128);
        __m128i c     = _mm_cmpeq_epi32(v0, v1);
        uint32_t mask = _mm_movemask_epi8(c);

        a128++;
        b128++;
        __m128i v0b    = _mm_loadu_si128(a128);
        __m128i v1b    = _mm_loadu_si128(b128);
        __m128i cb     = _mm_cmpeq_epi32(v0b, v1b);
        uint32_t maskb = _mm_movemask_epi8(cb);

        if (mask != 0xffff || maskb != 0xffff) // Something has changed, figure out where.
        {
            if (mask == 0xffff)
                mask = maskb;
            else
                a128--; // ignore b128 since we'll return anyway

            size_t ret = (((char*)a128 - (char*)a) | (compat_ctz(~mask))) >> 1;
            return (ret | (a[ret] == b[ret]));
        }
        a128++;
        b128++;
    }
}
template<>
/*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
{
    u64* _GSmem = (u64*) GSmem;
    u64* clut = (u64*)GetClutBufferAddress<u32>(csa);

    while(clutsize > 0) {
#ifdef ZEROGS_SSE2
        // Note: local memory data is swizzled
        __m128i GSmem_0 = _mm_load_si128((__m128i*)_GSmem);   // 9  8  1 0
        __m128i GSmem_1 = _mm_load_si128((__m128i*)_GSmem+1); // 11 10 3 2
        __m128i GSmem_2 = _mm_load_si128((__m128i*)_GSmem+2); // 13 12 5 4
        __m128i GSmem_3 = _mm_load_si128((__m128i*)_GSmem+3); // 15 14 7 6

        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
        __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
        __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);

        __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_0, GSmem_1), clut_0);

        __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_2, GSmem_3), clut_1);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_0, GSmem_1), clut_2);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_2, GSmem_3), clut_3);
        result = _mm_and_si128(result, result_tmp);

        u32 result_int = _mm_movemask_epi8(result);
        if (result_int != 0xFFFF)
            return true;
#else
        // I see no point in keeping an MMX version; the SSE2 version is probably faster.
        // Keep a slow portable C version for reference/debug.
        // Note: local memory data is swizzled
        if (clut[0] != _GSmem[0] || clut[1] != _GSmem[2] || clut[2] != _GSmem[4] || clut[3] != _GSmem[6]
                || clut[4] != _GSmem[1] || clut[5] != _GSmem[3] || clut[6] != _GSmem[5] || clut[7] != _GSmem[7])
            return true;
#endif

        // go to the next memory block
        _GSmem += 32;

        // go back to the previous memory block then down one memory column
        if (clutsize & 0x40) {
            _GSmem -= (64-8);
        }
        // In case the previous operation (down one column) crossed the block
        // boundary, go to the next block
        if (clutsize == 0x240) {
            _GSmem += 32;
        }

        clut += 8;
        clutsize -= 64;
    }

    return false;
}
QT_BEGIN_NAMESPACE

bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionFlags)
{
    Q_ASSERT(data->format == QImage::Format_ARGB32);

    // extra pixels on each line
    const int spare = data->width & 3;
    // width in pixels of the pad at the end of each line
    const int pad = (data->bytes_per_line >> 2) - data->width;
    const int iter = data->width >> 2;
    int height = data->height;

    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

    __m128i *d = reinterpret_cast<__m128i*>(data->data);
    while (height--) {
        const __m128i *end = d + iter;

        for (; d != end; ++d) {
            const __m128i srcVector = _mm_loadu_si128(d);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // opaque, data is unchanged
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                // fully transparent
                _mm_storeu_si128(d, nullVector);
            } else {
                __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));

                __m128i result;
                BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
                _mm_storeu_si128(d, result);
            }
        }

        QRgb *p = reinterpret_cast<QRgb*>(d);
        QRgb *pe = p + spare;
        for (; p != pe; ++p) {
            if (*p < 0x00ffffff)
                *p = 0;
            else if (*p < 0xff000000)
                *p = PREMUL(*p);
        }

        d = reinterpret_cast<__m128i*>(p + pad);
    }

    data->format = QImage::Format_ARGB32_Premultiplied;
    return true;
}
static int VectorMismatch_SSE2(const uint32_t* const array1,
                               const uint32_t* const array2, int length) {
  int match_len;
  if (length >= 12) {
    __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
    __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
    match_len = 0;
    do {
      // Loop unrolling and early load both provide a speedup of 10% for the
      // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
      const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
      const __m128i B0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
      const __m128i B1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
      if (_mm_movemask_epi8(cmpA) != 0xffff) break;
      match_len += 4;

      {
        const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
        A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
        A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
        if (_mm_movemask_epi8(cmpB) != 0xffff) break;
        match_len += 4;
      }
    } while (match_len + 12 < length);
  } else {
    match_len = 0;
    // Unroll the potential first two loops.
    if (length >= 4 &&
        _mm_movemask_epi8(_mm_cmpeq_epi32(
            _mm_loadu_si128((const __m128i*)&array1[0]),
            _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
      match_len = 4;
      if (length >= 8 &&
          _mm_movemask_epi8(_mm_cmpeq_epi32(
              _mm_loadu_si128((const __m128i*)&array1[4]),
              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
        match_len = 8;
      }
    }
  }

  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}
SIMDValue SIMDFloat32x4Operation::OpMaxNum(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue mask, mask2, t1, t2;

    // This is the correct result, or b if either is NaN or both are +/-0.0
    x86Result.m128_value = _mm_max_ps(tmpaValue.m128_value, tmpbValue.m128_value);
    // Find NaNs in b
    mask.m128_value = _mm_cmpunord_ps(tmpbValue.m128_value, tmpbValue.m128_value);
    // Find +0.0 in a
    mask2.m128i_value = _mm_cmpeq_epi32(tmpaValue.m128i_value, X86_ALL_ZEROS.m128i_value);
    // mask2 is -0.0 where a is +0.0
    mask2.m128_value = _mm_and_ps(mask2.m128_value, X86_TWO_31_I4.m128_value);
    // For lanes where a is +0.0, the result is either correct (positive), or b, which is possibly -0.0.
    // It is safe to force the sign to positive for those lanes: -0.0 becomes +0.0.
    x86Result.m128_value = _mm_andnot_ps(mask2.m128_value, x86Result.m128_value);
    // For NaNs in b, choose a; else keep the result.
    t1.m128_value = _mm_and_ps(tmpaValue.m128_value, mask.m128_value);
    t2.m128_value = _mm_andnot_ps(mask.m128_value, x86Result.m128_value);
    x86Result.m128_value = _mm_or_ps(t1.m128_value, t2.m128_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
__m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpeq_epi32
  // DAG: icmp eq <4 x i32>
  //
  // ASM-LABEL: test_mm_cmpeq_epi32
  // ASM: pcmpeqd
  return _mm_cmpeq_epi32(A, B);
}
bool SseBitcoinSha256::FindNonce(uint32_t& nonce) {
#if UCFG_BITCOIN_ASM
    CSseData& sseData = SseData();
    return CalcSha256Sse(sseData.m_4w, sseData.m_4midstate, m_midstate_after_3, UCFG_BITCOIN_NPAR, nonce);
#else
    __m128i *m_4w1 = &m_4w[16*UCFG_BITCOIN_WAY];  //!!!?
    __m128i offset = _mm_set_epi32(3, 2, 1, 0);

    for (int i=0; i<UCFG_BITCOIN_NPAR; i+=UCFG_BITCOIN_WAY, nonce+=UCFG_BITCOIN_WAY) {
        m_4w[3] = _mm_add_epi32(_mm_set1_epi32(nonce), offset);
        for (int j=0; j<8; ++j)
            m_4w1[j] = _mm_set1_epi32(m_midstate_after_3[j]);
#if UCFG_BITCOIN_ASM
        CalcSha256Sse(m_4w, m_4midstate, m_4w1, 18, 3, 64);
#else
        CalcRounds(m_4w, m_4midstate, m_4w1, 18, 3, 64);
#endif

#if UCFG_BITCOIN_WAY==6 || UCFG_BITCOIN_WAY==8
        __m128i v[16];
        for (int j=0; j<8; ++j)
            v[j*2] = v[j*2+1] = _mm_set1_epi32(g_sha256_hinit[j]);
#else
        __m128i v[8];
        for (int j=0; j<8; ++j)
            v[j] = _mm_set1_epi32(g_sha256_hinit[j]);
#endif

#if UCFG_BITCOIN_ASM
        __m128i e = CalcSha256Sse(m_4w1, v, v, 16, 0, 61);
#else
        __m128i e = CalcRounds(m_4w1, v, v, 16, 0, 61);  // 61 rounds are enough here
#endif
        __m128i p = _mm_cmpeq_epi32(_mm_add_epi32(e, _mm_set1_epi32(g_sha256_hinit[7])), _mm_setzero_si128());
        uint64_t *p64 = (uint64_t*)&p;
        if (p64[0] | p64[1]) {
            if (_mm_extract_epi16(p, 0) != 0)
                return true;
            if (_mm_extract_epi16(p, 2) != 0) {
                nonce += 1;
                return true;
            }
            if (_mm_extract_epi16(p, 4) != 0) {
                nonce += 2;
                return true;
            }
            if (_mm_extract_epi16(p, 6) != 0) {
                nonce += 3;
                return true;
            }
        }
    }
    return false;
#endif
}
SIMDValue SIMDInt32x4Operation::OpEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmpeq_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
unsigned int vp9_sad3x16_sse2(const unsigned char *src_ptr,
                              int src_stride,
                              const unsigned char *ref_ptr,
                              int ref_stride) {
  int r;
  __m128i s0, s1, s2, s3;
  __m128i r0, r1, r2, r3;
  __m128i sad = _mm_setzero_si128();
  __m128i mask;
  const int offset = (uintptr_t)src_ptr & 3;

  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
   * takes much less time.
   */
  if (offset == 1)
    src_ptr -= 1;

  /* mask = 0xffffffffffff0000ffffffffffff0000 */
  mask = _mm_cmpeq_epi32(sad, sad);
  mask = _mm_slli_epi64(mask, 16);

  for (r = 0; r < 16; r += 4) {
    s0 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 0 * src_stride));
    s1 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 1 * src_stride));
    s2 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 2 * src_stride));
    s3 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 3 * src_stride));
    r0 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 0 * ref_stride));
    r1 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 1 * ref_stride));
    r2 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 2 * ref_stride));
    r3 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 3 * ref_stride));

    s0 = _mm_unpacklo_epi8(s0, s1);
    r0 = _mm_unpacklo_epi8(r0, r1);
    s2 = _mm_unpacklo_epi8(s2, s3);
    r2 = _mm_unpacklo_epi8(r2, r3);
    s0 = _mm_unpacklo_epi64(s0, s2);
    r0 = _mm_unpacklo_epi64(r0, r2);

    /* throw out extra byte */
    if (offset == 1)
      s0 = _mm_and_si128(s0, mask);
    else
      s0 = _mm_slli_epi64(s0, 16);
    r0 = _mm_slli_epi64(r0, 16);

    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));

    src_ptr += src_stride * 4;
    ref_ptr += ref_stride * 4;
  }

  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
  return _mm_cvtsi128_si32(sad);
}
int SSEBinSearchBlock::search(uint32_t key) const {
    const __m128i keys = _mm_set1_epi32(key);
    __m128i v;

    int limit = data.size() - 1;
    int a = 0;
    int b = limit;

    while (a <= b) {
        const int c = (a + b)/2;

        if (data[c] == key) {
            return c;
        }

        if (key < data[c]) {
            b = c - 1;

            if (b >= 4) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[b - 4]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return b - 4 + __builtin_ctz(mask)/4;
                }
            }
        } else {
            a = c + 1;

            if (a + 4 < limit) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[a]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return a + __builtin_ctz(mask)/4;
                }
            }
        }
    }

    return -1;
}
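/* A hedged aside on the mask arithmetic above (standalone sketch, my own
 * names): _mm_movemask_epi8 yields one bit per byte, so a matching 32-bit
 * lane contributes four consecutive set bits, and __builtin_ctz(mask)/4
 * (GCC/Clang builtin) recovers the index of the lowest matching lane. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static int first_equal_lane(__m128i v, uint32_t key) {
    const __m128i eq = _mm_cmpeq_epi32(v, _mm_set1_epi32(key));
    const int mask = _mm_movemask_epi8(eq);   /* 4 bits per 32-bit lane */
    return mask ? __builtin_ctz(mask) / 4 : -1;
}

int main(void) {
    const __m128i v = _mm_setr_epi32(10, 20, 30, 20);
    printf("%d\n", first_equal_lane(v, 20));  /* prints 1 (lowest match wins) */
    printf("%d\n", first_equal_lane(v, 99));  /* prints -1 */
    return 0;
}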
static inline __m128i _mm_setone_si128() {
    /*
     * Clang apparently doesn't have _mm_undefined_si128.
     * On Visual C++, _mm_undefined_si128 is replaced with _mm_setzero_si128 anyway.
     * The rest of the processing uses an all-zero vector regardless,
     * so doing the same here should be fine.
     */
    __m128i x = _mm_setzero_si128(); // _mm_undefined_si128();
    return _mm_cmpeq_epi32(x, x);
}
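/* A quick standalone check of the idiom above (my sketch, not from the
 * snippet's source): comparing a register with itself makes every 32-bit
 * lane equal, so _mm_cmpeq_epi32(x, x) yields all ones regardless of the
 * initial contents of x. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i x = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(x, x);   /* 0xFFFFFFFF in every lane */
    unsigned out[4];
    _mm_storeu_si128((__m128i*)out, ones);
    printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
    return 0; /* prints: ffffffff ffffffff ffffffff ffffffff */
}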
__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpeq_epi32(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
/**
 * Convert a chroma-keyed image to standard ARGB32.
 * SSE2-optimized version.
 *
 * This operates on the image itself, rather than returning
 * an adjusted copy of the image.
 *
 * NOTE: The image *must* be ARGB32.
 *
 * @param key Chroma key color.
 * @return 0 on success; negative POSIX error code on error.
 */
int rp_image::apply_chroma_key_sse2(uint32_t key)
{
	RP_D(rp_image);
	rp_image_backend *const backend = d->backend;

	assert(backend->format == FORMAT_ARGB32);
	if (backend->format != FORMAT_ARGB32) {
		// ARGB32 only.
		return -EINVAL;
	}

	const unsigned int diff = (backend->stride - this->row_bytes()) / sizeof(uint32_t);
	uint32_t *img_buf = static_cast<uint32_t*>(backend->data());

	// SSE2 constants.
	const __m128i xmm_key = _mm_setr_epi32(key, key, key, key);
	const __m128i xmm_ones = _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	for (unsigned int y = static_cast<unsigned int>(backend->height); y > 0; y--) {
		// Process 4 pixels per iteration with SSE2.
		unsigned int x = static_cast<unsigned int>(backend->width);
		for (; x > 3; x -= 4, img_buf += 4) {
			__m128i *xmm_data = reinterpret_cast<__m128i*>(img_buf);

			// Compare the pixels to the chroma key.
			// Equal values will be 0xFFFFFFFF.
			// Non-equal values will be 0x00000000.
			__m128i res = _mm_cmpeq_epi32(*xmm_data, xmm_key);

			// Invert the results and mask the original data.
			// Original data will now have 00s for chroma-keyed pixels.
			*xmm_data = _mm_and_si128(_mm_xor_si128(res, xmm_ones), *xmm_data);
		}

		// Remaining pixels.
		for (; x > 0; x--, img_buf++) {
			if (*img_buf == key) {
				*img_buf = 0;
			}
		}

		// Next row.
		img_buf += diff;
	}

	// Adjust sBIT.
	// TODO: Only if transparent pixels were found.
	if (d->has_sBIT && d->sBIT.alpha == 0) {
		d->sBIT.alpha = 1;
	}

	// Chroma key applied.
	return 0;
}
SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
{
    X86SIMDValue x86Result = { 0 };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    X86SIMDValue temp, temp2;
    X86SIMDValue two_31_f4, two_31_i4;
    int mask = 0;

    // any lanes < 0 ?
    temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
    mask = _mm_movemask_ps(temp.m128_value);
    // negative values are out of range, caller should throw Range Error
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // CVTTPS2DQ does a range check over the signed range [-2^31, 2^31-1],
    // so it will fail to convert values >= 2^31. To fix this, subtract 2^31
    // from values >= 2^31, do CVTTPS2DQ, then add 2^31 back.
    _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
    // any lanes >= 2^31 ?
    temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
    // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
    two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
    // subtract 2^31 from lanes >= 2^31, leave others unchanged
    v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

    // CVTTPS2DQ
    x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

    // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before value adjustment)
    temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
    mask = _mm_movemask_ps(temp2.m128_value);
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // we passed the range check.
    // Add 2^31 back to the adjusted values:
    // take the first bit of the 2^31 float mask (0x4f000...0 << 1),
    // AND it with the 2^31 int mask (0x8000..0), which clears the first bit
    // for lanes that weren't adjusted.
    _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
    two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
    two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);

    // add 2^31 back to adjusted values
    // Note: at this point all values are in [0, 2^31-1]; adding 2^31 is guaranteed not to overflow.
    x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static bool equals(const __m128i& lhs, const __m128i& rhs)
{
    char bytes[16];
    __m128i result = _mm_cmpeq_epi32(lhs, rhs);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), result);
    for (int offset = 0; offset < 16; ++offset) {
        if (bytes[offset] == 0)
            return false;
    }
    return true;
}
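// A hedged alternative (my sketch, not from the snippet's source): the
// store-and-scan loop above can be collapsed into a single movemask test,
// since _mm_cmpeq_epi32 only ever produces all-ones or all-zero lanes.
static bool equals_movemask(const __m128i& lhs, const __m128i& rhs)
{
    // All 16 comparison bytes are 0xFF exactly when every lane matched.
    return _mm_movemask_epi8(_mm_cmpeq_epi32(lhs, rhs)) == 0xFFFF;
}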
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (dc == da)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
    rc1 = _mm_add_epi32(rc1, tmp3);
    rc1 = clamp_div255round_SSE2(rc1);
    rc1 = _mm_and_si128(cmp1, rc1);

    // else if (0 == sc)
    __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i tmp4 = _mm_sub_epi32(da, dc);
    tmp4 = Multiply32_SSE2(tmp4, sa);
    tmp4 = shim_mm_div_epi32(tmp4, sc);
    __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4));
    tmp5 = Multiply32_SSE2(sa, tmp5);
    __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
  int i;
  double retval = 0.;
  int sumX, sumXY;
  int32_t tmp[4];
  __m128i zero = _mm_setzero_si128();
  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
  __m128i sumXY_128 = zero;
  __m128i sumX_128 = zero;

  for (i = 0; i < 256; i += 4) {
    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));

    // Check if any X is non-zero: this actually provides a speedup as X is
    // usually sparse.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
      const __m128i xy_128 = _mm_add_epi32(x, y);
      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
      sumX_128 = _mm_add_epi32(sumX_128, x);

      // Analyze the different X + Y.
      _mm_storeu_si128((__m128i*)tmp, xy_128);
      ANALYZE_XY(0);
      ANALYZE_XY(1);
      ANALYZE_XY(2);
      ANALYZE_XY(3);
    } else {   // X is fully 0, so only deal with Y.
      sumXY_128 = _mm_add_epi32(sumXY_128, y);
      ANALYZE_X_OR_Y(Y, 0);
      ANALYZE_X_OR_Y(Y, 1);
      ANALYZE_X_OR_Y(Y, 2);
      ANALYZE_X_OR_Y(Y, 3);
    }
  }

  // Sum up sumX_128 to get sumX.
  _mm_storeu_si128((__m128i*)tmp, sumX_128);
  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  // Sum up sumXY_128 to get sumXY.
  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
  return (float)retval;
}
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i diff = _mm_sub_epi32(sa, sc);
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (0 == dc)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));

    // else if (0 == diff)
    __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
    rc2 = _mm_add_epi32(rc2, tmp3);
    rc2 = clamp_div255round_SSE2(rc2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i value = _mm_mullo_epi16(dc, sa);
    diff = shim_mm_div_epi32(value, diff);
    __m128i tmp4 = SkMin32_SSE2(da, diff);
    tmp4 = Multiply32_SSE2(sa, tmp4);
    __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
static int cmp_all (void *binary, int count)
{
#ifdef _OPENMP
	int i;

	for (i = 0; i < count; i++)
		if (((uint32_t *) binary)[0] == crypt_key[0][i])
			return 1;
	return 0;
#else
	static const __m128i zero = {0};
	__m128i tmp;
	__m128i bin;
	__m128i digest;

	digest = _mm_load_si128 ((__m128i *) crypt_key[0]);
	bin = _mm_set1_epi32 (((uint32_t *) binary)[0]);
	tmp = _mm_cmpeq_epi32 (bin, digest);

	/* non-zero iff some lane of tmp is all-ones, i.e. some digest word matched */
	return _mm_movemask_epi8 (_mm_cmpeq_epi32 (tmp, zero)) != 0xffff;
#endif
}
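/* The double-cmpeq above implements an "any lane equal" test. A hedged
 * standalone restatement (my names, not from the JtR source): */
#include <emmintrin.h>

static int any_lane_equal(__m128i a, __m128i b) {
    const __m128i eq = _mm_cmpeq_epi32(a, b);  /* all-ones lanes on match */
    const __m128i none = _mm_cmpeq_epi32(eq, _mm_setzero_si128());
    /* movemask == 0xffff iff every lane of eq was zero, i.e. no match at all */
    return _mm_movemask_epi8(none) != 0xffff;
}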
inline FORCE_INLINE __m128 mm_cvtph_ps(__m128i x)
{
	__m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)113 << 23));
	__m128i shift_exp = _mm_set1_epi32(0x7C00UL << 13);
	__m128i sign_mask = _mm_set1_epi32(0x8000U);
	__m128i mant_mask = _mm_set1_epi32(0x7FFF);
	__m128i exp_adjust = _mm_set1_epi32((127UL - 15UL) << 23);
	__m128i exp_adjust_nan = _mm_set1_epi32((127UL - 16UL) << 23);
	__m128i exp_adjust_denorm = _mm_set1_epi32(1UL << 23);
	__m128i zero = _mm_set1_epi16(0);

	__m128i exp, ret, ret_nan, ret_denorm, sign, mask0, mask1;

	x = _mm_unpacklo_epi16(x, zero);

	ret = _mm_and_si128(x, mant_mask);
	ret = _mm_slli_epi32(ret, 13);

	exp = _mm_and_si128(shift_exp, ret);
	ret = _mm_add_epi32(ret, exp_adjust);

	mask0 = _mm_cmpeq_epi32(exp, shift_exp);
	mask1 = _mm_cmpeq_epi32(exp, zero);

	ret_nan = _mm_add_epi32(ret, exp_adjust_nan);
	ret_denorm = _mm_add_epi32(ret, exp_adjust_denorm);
	ret_denorm = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ret_denorm), magic));

	sign = _mm_and_si128(x, sign_mask);
	sign = _mm_slli_epi32(sign, 16);

	ret = mm_blendv_ps(ret_nan, ret, mask0);
	ret = mm_blendv_ps(ret_denorm, ret, mask1);
	ret = _mm_or_si128(ret, sign);

	return _mm_castsi128_ps(ret);
}
SIMDValue SIMDUint32x4Operation::OpLessThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue signBits;
    signBits.m128i_value = _mm_set1_epi32(0x80000000);

    // Unsigned ints can be compared with a signed comparison after flipping
    // each operand's sign bit (XOR with 0x80000000).
    tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, signBits.m128i_value);
    tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, signBits.m128i_value);
    x86Result.m128i_value = _mm_cmplt_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?
    tmpaValue.m128i_value = _mm_cmpeq_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?
    x86Result.m128i_value = _mm_or_si128(x86Result.m128i_value, tmpaValue.m128i_value);    // result = (a<b)|(a==b)

    return X86SIMDValue::ToSIMDValue(x86Result);
}
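/* A small self-check of the sign-flip trick (my sketch, with made-up test
 * values): XORing both operands with 0x80000000 maps unsigned order onto
 * signed order, so a signed compare then gives the unsigned answer. */
#include <emmintrin.h>
#include <stdint.h>
#include <assert.h>

static int unsigned_lt_lane0(uint32_t a, uint32_t b) {
    const __m128i sign = _mm_set1_epi32(0x80000000);
    const __m128i va = _mm_xor_si128(_mm_set1_epi32((int)a), sign);
    const __m128i vb = _mm_xor_si128(_mm_set1_epi32((int)b), sign);
    return _mm_cvtsi128_si32(_mm_cmplt_epi32(va, vb)) != 0;
}

int main(void) {
    /* 0xFFFFFFFF is -1 as a signed int, but the largest 32-bit unsigned value. */
    assert(unsigned_lt_lane0(1u, 0xFFFFFFFFu) == 1);
    assert(unsigned_lt_lane0(0xFFFFFFFFu, 1u) == 0);
    return 0;
}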
bool test(const index_t & kmer) const {
    const __m128i zero = _mm_setzero_si128();
    const size_t BitsPerElement = sizeof(block_t) * 8;
    const Hash hashfunction = Hash();
    kmer_t hashvalue = kmer;

    for (int hcount = this->h; hcount > 0; hcount--) {
        hashvalue = hashfunction(hashvalue);
        // we expect the compiler to automatically turn this into a shift
        // because it's a const power of two
        size_t offset = (hashvalue % this->m) / BitsPerElement;

        // Select the probed bit. The element is definitely absent if the
        // masked block is all zero, i.e. every 32-bit lane compares equal
        // to zero (movemask == 0xFFFF). The casts keep the integer compare
        // well-typed; the original passed an __m128 straight to
        // _mm_cmpeq_epi32 and had the comparison inverted.
        const __m128i bit = _mm_castps_si128(
            _mm_and_ps(bitarray[offset], masks[hashvalue & (BitsPerElement-1)]));
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(bit, zero)) == 0xFFFF)
            return false;
    }
    return true;
}
static void RENDER_StartLineHandler(const void * s) {
	if (s) {
		const Bitu *src = (Bitu*)s;
		Bitu *cache = (Bitu*)(render.scale.cacheRead);
		Bits count = render.src.start;
#if defined(__SSE__)
		if (sse2_available) {
#if defined (_MSC_VER)
#define SIZEOF_INT_P 4
#endif
			static const Bitu simd_inc = 16/SIZEOF_INT_P;
			while (count >= (Bits)simd_inc) {
				__m128i v = _mm_loadu_si128((const __m128i*)src);
				__m128i c = _mm_loadu_si128((const __m128i*)cache);
				__m128i cmp = _mm_cmpeq_epi32(v, c);
				if (GCC_UNLIKELY(_mm_movemask_epi8(cmp) != 0xFFFF))
					goto cacheMiss;
				count -= simd_inc; src += simd_inc; cache += simd_inc;
			}
		}
#endif
		while (count) {
			if (GCC_UNLIKELY(src[0] != cache[0]))
				goto cacheMiss;
			count--; src++; cache++;
		}
	}
	/* cacheHit */
	render.scale.cacheRead += render.scale.cachePitch;
	Scaler_ChangedLines[0] += Scaler_Aspect[ render.scale.inLine ];
	render.scale.inLine++;
	render.scale.outLine++;
	return;
cacheMiss:
	if (!GFX_StartUpdate( render.scale.outWrite, render.scale.outPitch )) {
		RENDER_DrawLine = RENDER_EmptyLineHandler;
		return;
	}
	render.scale.outWrite += render.scale.outPitch * Scaler_ChangedLines[0];
	RENDER_DrawLine = render.scale.lineHandler;
	RENDER_DrawLine( s );
}
void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(
	const Image2DCPtr &inputImage, const Image2DPtr &outputImage,
	const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput)
{
	const size_t width = inputImage->Width();
	const __m128i zero4i = _mm_set_epi32(0, 0, 0, 0);
	const __m128 zero4 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	const __m128 one4 = _mm_set_ps(1.0, 1.0, 1.0, 1.0);

	for(size_t y=0; y<inputImage->Height(); ++y)
	{
		const bool *rowPtr = inputMask->ValuePtr(0, y);
		const float *inputPtr = inputImage->ValuePtr(0, y);
		float *outputPtr = outputImage->ValuePtr(0, y);
		float *weightsPtr = weightsOutput->ValuePtr(0, y);
		const float *end = inputPtr + width;
		while(inputPtr < end)
		{
			// Assign each integer to one bool in the mask:
			// convert false to 0xFFFFFFFF and true to 0.
			__m128 conditionMask = _mm_castsi128_ps(
				_mm_cmpeq_epi32(_mm_set_epi32(rowPtr[3] || !isfinite(inputPtr[3]),
				                              rowPtr[2] || !isfinite(inputPtr[2]),
				                              rowPtr[1] || !isfinite(inputPtr[1]),
				                              rowPtr[0] || !isfinite(inputPtr[0])),
				                zero4i));

			_mm_store_ps(weightsPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, one4),
				_mm_andnot_ps(conditionMask, zero4)
			));
			_mm_store_ps(outputPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, _mm_load_ps(inputPtr)),
				_mm_andnot_ps(conditionMask, zero4)
			));

			rowPtr += 4;
			outputPtr += 4;
			inputPtr += 4;
			weightsPtr += 4;
		}
	}
}
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const __m128i *a128 = (const __m128i*)a;
   const __m128i *b128 = (const __m128i*)b;

   for (;;)
   {
      __m128i v0    = _mm_loadu_si128(a128);
      __m128i v1    = _mm_loadu_si128(b128);
      __m128i c     = _mm_cmpeq_epi32(v0, v1);
      uint32_t mask = _mm_movemask_epi8(c);

      if (mask != 0xffff) /* Something has changed, figure out where. */
      {
         /* The comparison works on 32-bit lanes, so the OR'd byte offset is
          * always a multiple of four and ret comes out even; the final OR
          * bumps it to the odd index when the first 16-bit element of the
          * differing lane still matches. */
         size_t ret = (((uint8_t*)a128 - (uint8_t*)a) | (compat_ctz(~mask))) >> 1;
         return ret | (a[ret] == b[ret]);
      }

      a128++;
      b128++;
   }
}
IplImage* calcMask(IplImage* image)
{
    IplImage* mask = cvCloneImage(image);

    um128i *pMask = (um128i*)mask->imageData;
    um128i *pImg  = (um128i*)image->imageData;
    um128i croma;
    um128i unos;

    unos.mm  = _mm_set1_epi8(0xff);
    // Use the first pixel of the image as the chroma-key color.
    croma.mm = _mm_set1_epi32(*((unsigned int *)pImg));

    for(int i = 0; i < image->imageSize; i += 16)
    {
        um128i px = *pImg++;
        px.mm = _mm_cmpeq_epi32(px.mm, croma.mm); // 0xFFFFFFFF where pixel == key
        px.mm = _mm_xor_si128(px.mm, unos.mm);    // invert: 0 where pixel == key
        *pMask++ = px;
    }
    return mask;
}
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
	__m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));
	__m128i inf = _mm_set1_epi32((uint32_t)255UL << 23);
	__m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23);
	__m128i sign_mask = _mm_set1_epi32(0x80000000UL);
	__m128i round_mask = _mm_set1_epi32(~0x0FFFU);

	__m128i ret_0x7E00 = _mm_set1_epi32(0x7E00);
	__m128i ret_0x7C00 = _mm_set1_epi32(0x7C00);

	__m128i f, sign, ge_inf, eq_inf;

	f = _mm_castps_si128(x);
	sign = _mm_and_si128(f, sign_mask);
	f = _mm_xor_si128(f, sign);

	ge_inf = _mm_cmpgt_epi32(f, inf);
	eq_inf = _mm_cmpeq_epi32(f, inf);

	f = _mm_and_si128(f, round_mask);
	f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic));
	f = _mm_sub_epi32(f, round_mask);

	f = mm_min_epi32(f, f16inf);
	f = _mm_srli_epi32(f, 13);

	f = mm_blendv_ps(ret_0x7E00, f, ge_inf);
	f = mm_blendv_ps(ret_0x7C00, f, eq_inf);

	sign = _mm_srli_epi32(sign, 16);
	f = _mm_or_si128(f, sign);

	f = mm_packus_epi32(f, _mm_setzero_si128());
	return f;
}
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,  *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in      *
 * vp9_encoder.c.                                                           *
 * For the joint cost:                                                      *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]          *
 * For the component costs:                                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                        *
 *         (Equal costs for both components)                                *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                       *
 *         (Cost function is even)                                          *
 * If these do not hold, then this function cannot be used without          *
 * modification, in which case you can revert to using the C                *
 * implementation, which does not rely on these properties.                 *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);

      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i + 2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
                     in_what_stride, (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
// @return true iff the two pages differ; false otherwise.
// @note Uses only SSE2 intrinsics, so compile with at least -msse2.
bool pagesDifferent (const void * b1, const void * b2) {

  enum { PAGE_SIZE = 4096 };

  // Make a mask, initially all 1's.
  register __m128i mask = _mm_setzero_si128();
  mask = _mm_cmpeq_epi32(mask, mask);

  __m128i * buf1 = (__m128i *) b1;
  __m128i * buf2 = (__m128i *) b2;

  // Some vectorizing pragmata here; not sure if gcc implements them.

#pragma vector always
  for (int i = 0; i < PAGE_SIZE / sizeof(__m128i); i += 8) {
#pragma ivdep
#pragma vector aligned

    register __m128i xmm1, xmm2;

    // Unrolled loop for speed: we load two 128-bit chunks at a time,
    // and logically AND in their comparison.
    // If the mask gets any zero bits, the bytes differ.
    xmm1 = _mm_load_si128 (&buf1[i]);
    xmm2 = _mm_load_si128 (&buf2[i]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+1]);
    xmm2 = _mm_load_si128 (&buf2[i+1]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+2]);
    xmm2 = _mm_load_si128 (&buf2[i+2]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+3]);
    xmm2 = _mm_load_si128 (&buf2[i+3]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+4]);
    xmm2 = _mm_load_si128 (&buf2[i+4]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+5]);
    xmm2 = _mm_load_si128 (&buf2[i+5]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+6]);
    xmm2 = _mm_load_si128 (&buf2[i+6]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+7]);
    xmm2 = _mm_load_si128 (&buf2[i+7]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    // Save the mask to see whether we have found a difference or not.
    unsigned long long buf[128 / sizeof(unsigned long long) / 8] __attribute__((aligned(16)));
    _mm_store_si128 ((__m128i *) &buf, mask);

    // IMPORTANT: make sure long long = 64bits!
    enum { VERIFY_LONGLONG_64 = 1 / (sizeof(long long) == 8) };

    // Now check the result.
    // Both buf[0] and buf[1] should be all ones.
    if ((buf[0] != (unsigned long long) -1) ||
        (buf[1] != (unsigned long long) -1)) {
      return true;
    }
  }

  // No differences found.
  return false;
}