// There's no equivalent in libc, you'd think so ... std::mismatch exists, but it's not optimized at all. :(
static inline size_t find_change(const uint16_t * a, const uint16_t * b)
{
	const __m128i * a128 = (const __m128i*)a;
	const __m128i * b128 = (const __m128i*)b;

	while (true)
	{
		__m128i v0 = _mm_loadu_si128(a128);
		__m128i v1 = _mm_loadu_si128(b128);
		__m128i c = _mm_cmpeq_epi32(v0, v1);
		uint32_t mask = _mm_movemask_epi8(c);

		a128++;
		b128++;
		__m128i v0b = _mm_loadu_si128(a128);
		__m128i v1b = _mm_loadu_si128(b128);
		__m128i cb = _mm_cmpeq_epi32(v0b, v1b);
		uint32_t maskb = _mm_movemask_epi8(cb);

		if (mask != 0xffff || maskb != 0xffff) // Something has changed, figure out where.
		{
			if (mask == 0xffff)
				mask = maskb;
			else
				a128--; // ignore b128 since we'll return anyway

			// Byte offset of the differing 16-byte block, combined with the index of the
			// first differing byte inside it (the block offset is a multiple of 16, so OR
			// acts as ADD), then >>1 converts the byte index to a uint16_t index.
			size_t ret = (((char*)a128 - (char*)a) | (compat_ctz(~mask))) >> 1;

			// The compare was done on 32-bit lanes, so ret is the even element of the
			// differing pair; if that element actually matches, bump to the odd one.
			return (ret | (a[ret] == b[ret]));
		}
		a128++;
		b128++;
	}
}
QT_BEGIN_NAMESPACE

bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionFlags)
{
    Q_ASSERT(data->format == QImage::Format_ARGB32);

    // extra pixels on each line
    const int spare = data->width & 3;
    // width in pixels of the pad at the end of each line
    const int pad = (data->bytes_per_line >> 2) - data->width;
    const int iter = data->width >> 2;
    int height = data->height;

    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

    __m128i *d = reinterpret_cast<__m128i*>(data->data);
    while (height--) {
        const __m128i *end = d + iter;

        for (; d != end; ++d) {
            const __m128i srcVector = _mm_loadu_si128(d);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // opaque, data is unchanged
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                // fully transparent
                _mm_storeu_si128(d, nullVector);
            } else {
                __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
                __m128i result;
                BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
                _mm_storeu_si128(d, result);
            }
        }

        QRgb *p = reinterpret_cast<QRgb*>(d);
        QRgb *pe = p + spare;
        for (; p != pe; ++p) {
            if (*p < 0x00ffffff)
                *p = 0;
            else if (*p < 0xff000000)
                *p = PREMUL(*p);
        }

        d = reinterpret_cast<__m128i*>(p + pad);
    }

    data->format = QImage::Format_ARGB32_Premultiplied;
    return true;
}
const char *ssechr(const char *s, char ch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i cx16 = _mm_set1_epi8(ch); // (ch) replicated 16 times.
    while (1) {
        __m128i  x = _mm_loadu_si128((__m128i const *)s);
        unsigned u = _mm_movemask_epi8(_mm_cmpeq_epi8(zero, x));
        // Keep only matches that occur before the terminating NUL:
        // (u - 1) sets the bits below the first zero byte, ~u clears the zero bytes themselves.
        unsigned v = _mm_movemask_epi8(_mm_cmpeq_epi8(cx16, x)) & ~u & (u - 1);
        if (v) return s + __builtin_ctz(v); // __builtin_ctz is 0-based (unlike ffs), so no -1 adjustment
        if (u) return NULL;
        s += 16;
    }
}
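/*
 * A minimal harness for the ssechr() above, comparing it against strchr().
 * The includes, test string, and main() are illustrative assumptions added here,
 * not part of the original snippet; the buffer is padded so the 16-byte loads
 * never run past allocated memory.
 */
#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    static const char buf[64] = "find the needle % in this haystack";
    const char *a = ssechr(buf, '%');
    const char *b = strchr(buf, '%');
    printf("ssechr: %td, strchr: %td\n",
           a ? a - buf : -1, b ? b - buf : -1);
    return (a == b) ? 0 : 1;   /* both should point at the same '%' */
}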
static int VectorMismatch_SSE2(const uint32_t* const array1,
                               const uint32_t* const array2, int length) {
  int match_len;
  if (length >= 12) {
    __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
    __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
    match_len = 0;
    do {
      // Loop unrolling and early load both provide a speedup of 10% for the
      // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
      const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
      const __m128i B0 =
          _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
      const __m128i B1 =
          _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
      if (_mm_movemask_epi8(cmpA) != 0xffff) break;
      match_len += 4;
      {
        const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
        A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
        A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
        if (_mm_movemask_epi8(cmpB) != 0xffff) break;
        match_len += 4;
      }
    } while (match_len + 12 < length);
  } else {
    match_len = 0;
    // Unroll the potential first two loops.
    if (length >= 4 &&
        _mm_movemask_epi8(_mm_cmpeq_epi32(
            _mm_loadu_si128((const __m128i*)&array1[0]),
            _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
      match_len = 4;
      if (length >= 8 &&
          _mm_movemask_epi8(_mm_cmpeq_epi32(
              _mm_loadu_si128((const __m128i*)&array1[4]),
              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
        match_len = 8;
      }
    }
  }

  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}
/*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
{
    u64* _GSmem = (u64*) GSmem;
    u64* clut = (u64*)GetClutBufferAddress<u32>(csa);

    while(clutsize > 0) {
#ifdef ZEROGS_SSE2
        // Note: local memory data is swizzled
        __m128i GSmem_0 = _mm_load_si128((__m128i*)_GSmem);     //  9  8  1  0
        __m128i GSmem_1 = _mm_load_si128((__m128i*)_GSmem+1);   // 11 10  3  2
        __m128i GSmem_2 = _mm_load_si128((__m128i*)_GSmem+2);   // 13 12  5  4
        __m128i GSmem_3 = _mm_load_si128((__m128i*)_GSmem+3);   // 15 14  7  6

        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
        __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
        __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);

        __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_0, GSmem_1), clut_0);

        __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_2, GSmem_3), clut_1);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_0, GSmem_1), clut_2);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_2, GSmem_3), clut_3);
        result = _mm_and_si128(result, result_tmp);

        u32 result_int = _mm_movemask_epi8(result);
        if (result_int != 0xFFFF)
            return true;
#else
        // There is no point in keeping an MMX version; the SSE2 version is probably faster.
        // Keep a slow portable C version for reference/debug.
        // Note: local memory data is swizzled
        if (clut[0] != _GSmem[0] || clut[1] != _GSmem[2] || clut[2] != _GSmem[4] || clut[3] != _GSmem[6] ||
            clut[4] != _GSmem[1] || clut[5] != _GSmem[3] || clut[6] != _GSmem[5] || clut[7] != _GSmem[7])
            return true;
#endif

        // go to the next memory block
        _GSmem += 32;

        // go back to the previous memory block then down one memory column
        if (clutsize & 0x40) {
            _GSmem -= (64-8);
        }

        // In case the previous operation (down one column) crossed the block boundary,
        // go to the next block
        if (clutsize == 0x240) {
            _GSmem += 32;
        }

        clut += 8;
        clutsize -= 64;
    }

    return false;
}
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length)
{
    // most of our strings will be tens of bytes long
    // -> afford some minor overhead to handle the main part very fast

    const char* end = psz + length;
    if (sse2supported)
    {
        __m128i mask = _mm_set_epi8 ( '%', '%', '%', '%', '%', '%', '%', '%'
                                    , '%', '%', '%', '%', '%', '%', '%', '%');

        for (; psz + sizeof (mask) <= end; psz += sizeof (mask))
        {
            // fetch the next 16 bytes from the source
            __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz);

            // check for the escape character '%'
            int flags = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, mask));
            if (flags != 0)
                return true;
        };
    }

    // handle the odd bytes at the end of the string
    for (; psz < end; ++psz)
        if (*psz == '%')
            return true;

    return false;
}
/* Not so fussy on its alignment */
static void
threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
{
    __m128i input1;
    __m128i input2;
    int result_int;
    byte *sse_data;
    const unsigned int mask1 = 0x80808080;
    __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1);

    sse_data = (byte*) &(result_int);
    /* Load */
    input1 = _mm_loadu_si128((const __m128i *)contone_ptr);
    input2 = _mm_loadu_si128((const __m128i *) thresh_ptr);
    /* Unsigned subtraction does Unsigned saturation so we have to use the
       signed operation */
    input1 = _mm_xor_si128(input1, sign_fix);
    input2 = _mm_xor_si128(input2, sign_fix);
    /* Subtract the two */
    input2 = _mm_subs_epi8(input1, input2);
    /* Grab the sign mask */
    result_int = _mm_movemask_epi8(input2);
    /* bit wise reversal on 16 bit word */
    ht_data[0] = bitreverse[sse_data[0]];
    ht_data[1] = bitreverse[sse_data[1]];
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
         const size_t elem_size) {
    /* With a bit of care, this could be written such that it is */
    /* in_buf = out_buf safe. */
    char* in_b = (char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;

    size_t nbyte = elem_size * size;

    __m128i xmm;
    int32_t bt;
    size_t ii, jj, kk;
    size_t ind;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 2) {
        bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
    } else {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm_movemask_epi8(xmm);
                    xmm = _mm_slli_epi16(xmm, 1);
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    out_ui16[ind / 2] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16 */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask */
   return _mm_movemask_epi8(result);
}
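/*
 * Each bit of the 16-bit value returned above is the sign bit of one stepped
 * edge value (the saturating packs preserve the sign), so callers typically
 * either compare the mask against 0 / 0xffff or walk its set bits. The helper
 * below is an illustrative, self-contained sketch of that bit-walking idiom;
 * it is not part of the original source.
 */
#include <stdio.h>

static void print_mask_positions(unsigned mask)
{
    while (mask) {
        int pos = __builtin_ctz(mask);  /* 0..15: position of the next set bit */
        printf("bit %d set\n", pos);
        mask &= mask - 1;               /* clear that bit and continue */
    }
}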
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    const __m128i K = LOADU(key);
    __m128i S[4];

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, K);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K);

    /* Verify tag: the movemask is 0xFFFF only when all 16 tag bytes match,
       so this expression yields 0 on success and -1 on failure without a
       data-dependent branch. */
    S[3] = _mm_cmpeq_epi8(S[3], LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm_movemask_epi8(S[3]) & 0xFFFFU) + 1) >> 16) - 1;
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
         const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    uint16_t* out_ui16;
    int64_t count;
    size_t nbyte = elem_size * size;
    __m128i xmm;
    int32_t bt;
    size_t ii, kk;

    CHECK_MULT_EIGHT(nbyte);

    for (ii = 0; ii + 15 < nbyte; ii += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm_movemask_epi8(xmm);
            xmm = _mm_slli_epi16(xmm, 1);
            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_ui16 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 16);
    return count;
}
void
mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);
    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros     = _mm_setzero_si128();
    const __m128i prefix    = sse::load(needle);
    const __m128i suffix    = sse::load(needle + 4);
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);

            if (_mm_testc_si128(cmp, suff_mask)) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
void WriteBufferValidUTF8::nextImpl()
{
    char * p = memory.data();
    char * valid_start = p;

    while (p < pos)
    {
#ifdef __SSE2__
        /// Fast skip of ASCII
        static constexpr size_t SIMD_BYTES = 16;
        const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;

        while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
            p += SIMD_BYTES;

        if (!(p < pos))
            break;
#endif

        size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];

        if (len > 4)
        {
            /// Invalid start of sequence. Skip one byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
        else if (p + len > pos)
        {
            /// Sequence was not fully written to this buffer.
            break;
        }
        else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
        {
            /// Valid sequence.
            p += len;
        }
        else
        {
            /// Invalid sequence. Skip just first byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
    }

    putValid(valid_start, p - valid_start);

    size_t cnt = pos - p;

    /// Shift unfinished sequence to start of buffer.
    for (size_t i = 0; i < cnt; ++i)
        memory[i] = p[i];

    working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
}
int test_mm_movemask_epi8(__m128i A) {
  // DAG-LABEL: test_mm_movemask_epi8
  // DAG: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
  //
  // ASM-LABEL: test_mm_movemask_epi8
  // ASM: pmovmskb
  return _mm_movemask_epi8(A);
}
void WriteBufferValidUTF8::nextImpl()
{
    char *p = &memory[0];
    char *valid_start = p;

    while (p < pos)
    {
#ifdef __x86_64__
        /// Fast skip of ASCII
        static constexpr size_t SIMD_BYTES = 16;
        const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;

        while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
            p += SIMD_BYTES;

        if (!(p < pos))
            break;
#endif

        size_t len = 1 + static_cast<size_t>(trailingBytesForUTF8[static_cast<unsigned char>(*p)]);

        if (len > 4)
        {
            /// Invalid start of sequence. Skip one byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
        else if (p + len > pos)
        {
            /// The sequence has not been fully written yet.
            break;
        }
        else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char*>(p), len))
        {
            /// Valid sequence.
            p += len;
        }
        else
        {
            /// Invalid sequence. Skip just the first byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
    }

    putValid(valid_start, p - valid_start);

    size_t cnt = pos - p;

    /// Shift the unfinished sequence to the start of the buffer.
    for (size_t i = 0; i < cnt; ++i)
        memory[i] = p[i];

    working_buffer = Buffer(&memory[cnt], &memory[0] + memory.size());
}
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                    intptr_t block_size, int64_t *ssz,
                                    int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bit
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
void* memrchr(void *dst, int c, size_t len)
{
    /* Backwards */
    uint8_t* a = dst;

    if(!len)
        return NULL;

    int i = len;
    int aligned_a = 0;

    aligned_a = ((uintptr_t)a & (sizeof(__m128i) - 1));

    /* Scan backwards one byte at a time until the end pointer is 16-byte aligned */
    if(aligned_a)
    {
        while(i && ((uintptr_t) &a[i] & ( sizeof(__m128i)-1)))
        {
            i--;
            if(a[i] == (char)c)
            {
                return a + i;
            }
        }
    }

    if(i >= 16)
    {
        uint32_t buf_32 = c;
        buf_32 |= (buf_32 << 8);
        buf_32 |= (buf_32 << 16);

        __m128i r1 = _mm_set_epi32(buf_32, buf_32, buf_32, buf_32);

        while(i >= 16)
        {
            i -= 16;
            __m128i x = _mm_loadu_si128((__m128i*)&(a[i])); //16byte
            __m128i cmp = _mm_cmpeq_epi8(x, r1);
            uint16_t result = (uint16_t)_mm_movemask_epi8(cmp);

            if(result != 0x0000U)
            {
                /* Find the highest set bit, i.e. the last matching byte in the block */
                i += 15;
                while(!(result & 0x8000))
                {
                    result = result << 1;
                    i--;
                }
                return a + i;
            }
        }
    }

    while(i)
    {
        i--;
        if(a[i] == (char)c)
        {
            return a + i;
        }
    }

    return NULL;
}
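/*
 * A small self-checking harness for the memrchr() above; the test buffer,
 * values, and main() are illustrative assumptions, not part of the original.
 * (The function shadows the glibc extension of the same name, so do not also
 * declare the library version in the same translation unit.)
 */
#include <stdio.h>

int main(void)
{
    char buf[100];
    for (size_t i = 0; i < sizeof(buf); i++)
        buf[i] = 'a';
    buf[17] = 'x';
    buf[63] = 'x';                                /* last occurrence */

    char *p = (char *)memrchr(buf, 'x', sizeof(buf));
    printf("last 'x' at offset %td\n", p - buf);  /* expected: 63 */
    return (p - buf == 63) ? 0 : 1;
}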
int SSEBinSearchBlock::search(uint32_t key) const {

    const __m128i keys = _mm_set1_epi32(key);
    __m128i v;

    int limit = data.size() - 1;
    int a = 0;
    int b = limit;

    while (a <= b) {
        const int c = (a + b)/2;

        if (data[c] == key) {
            return c;
        }

        if (key < data[c]) {
            b = c - 1;

            if (b >= 4) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[b - 4]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return b - 4 + __builtin_ctz(mask)/4;
                }
            }

        } else {
            a = c + 1;

            if (a + 4 < limit) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[a]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return a + __builtin_ctz(mask)/4;
                }
            }
        }
    }

    return -1;
}
virtual size_t match(const char* data, size_t size)
{
    __m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
    __m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
    __m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

    size_t offset = firstLetterPos;

    while (offset + 32 <= size)
    {
        __m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));

        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

        // advance offset regardless of match results to reduce number of live values
        offset += 16;

        while (mask != 0)
        {
            unsigned int pos = re2::countTrailingZeros(mask);
            size_t dataOffset = offset - 16 + pos - firstLetterOffset;

            mask &= ~(1 << pos);

            // check if we have a match
            __m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + dataOffset));
            __m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

            if (_mm_movemask_epi8(matchMask) == 0xffff)
            {
                size_t matchOffset = dataOffset + firstLetterOffset - firstLetterPos;

                // final check for full pattern
                if (matchOffset + pattern.size() < size && memcmp(data + matchOffset, pattern.c_str(), pattern.size()) == 0)
                {
                    return matchOffset;
                }
            }
        }
    }

    return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
}
inline void matrix16x8::transpose(square128& output, int x, int y)
{
    for (int j = 0; j < 8; j++)
    {
        int row = _mm_movemask_epi8(whole);
        whole = _mm_slli_epi64(whole, 1);
        // _mm_movemask_epi8 uses the most significant bit, hence +7-j
        output.doublebytes[8*x+7-j][y] = row;
    }
}
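/*
 * Standalone sketch of the same movemask/shift idiom, independent of the
 * matrix16x8 / square128 classes above: row j collects bit (7 - j) of every
 * input byte. The function and parameter names are illustrative assumptions.
 */
#include <emmintrin.h>
#include <stdint.h>

static void transpose_bits_16x8(__m128i v, uint16_t rows[8])
{
    for (int j = 0; j < 8; j++) {
        /* _mm_movemask_epi8 gathers the most significant bit of each of the 16 bytes */
        rows[j] = (uint16_t)_mm_movemask_epi8(v);
        /* shift left so the next-lower bit of each byte becomes the MSB;
           after at most 7 shifts no bit crosses into a neighbouring byte's MSB */
        v = _mm_slli_epi64(v, 1);
    }
}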
int valuesOK(cl_float8* to, cl_float8* from, size_t length)
{
#ifdef DEBUG
    printf("Checking data of size: %lu\n", length);
#endif
    for(int i = 0; i < length; ++i)
    {
#ifdef __SSE__
        __cl_float4 __hostFirstValue = to->v4[0];
        __cl_float4 __hostSecondValue = to->v4[1];
        __cl_float4 __deviceFirstValue = from->v4[0];
        __cl_float4 __deviceSecondValue = from->v4[1];

        __m128i vcmp = (__m128i) _mm_cmpneq_ps(__hostFirstValue, __deviceFirstValue);
        uint16_t test = _mm_movemask_epi8(vcmp);
        __m128i vcmp_2 = (__m128i) _mm_cmpneq_ps(__hostSecondValue, __deviceSecondValue);
        uint16_t test_2 = _mm_movemask_epi8(vcmp_2);

        if( (test|test_2) != 0 )
            return 0; // indicative that the result failed
#else
#error "SSE not supported, which is required for example code to work!"
#endif
    }
    return 1;
}
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
                                  VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE to compare 8 values with a single instruction.
  const __m128i zero = _mm_setzero_si128();
  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
  // Get the comparison results as a bitmask, consisting of two times 16 bits:
  // two identical bits for each result. Concatenate both bitmasks to get a
  // single 32 bit value. Negate the mask to get the position of entries that
  // are not equal to zero. We don't need to mask out least significant bits
  // according to res->first, since coeffs[0] is 0 if res->first > 0.
  const uint32_t mask =
      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
  // The position of the most significant non-zero bit indicates the position
  // of the last non-zero value. Divide the result by two because
  // _mm_movemask_epi8 operates on 8 bit values instead of 16 bit values.
  assert(res->first == 0 || coeffs[0] == 0);
  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
  res->coeffs = coeffs;
}
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
  int i;
  double retval = 0.;
  int sumX, sumXY;
  int32_t tmp[4];
  __m128i zero = _mm_setzero_si128();
  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
  __m128i sumXY_128 = zero;
  __m128i sumX_128 = zero;

  for (i = 0; i < 256; i += 4) {
    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));

    // Check if any X is non-zero: this actually provides a speedup as X is
    // usually sparse.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
      const __m128i xy_128 = _mm_add_epi32(x, y);
      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);

      sumX_128 = _mm_add_epi32(sumX_128, x);

      // Analyze the different X + Y.
      _mm_storeu_si128((__m128i*)tmp, xy_128);

      ANALYZE_XY(0);
      ANALYZE_XY(1);
      ANALYZE_XY(2);
      ANALYZE_XY(3);
    } else {
      // X is fully 0, so only deal with Y.
      sumXY_128 = _mm_add_epi32(sumXY_128, y);

      ANALYZE_X_OR_Y(Y, 0);
      ANALYZE_X_OR_Y(Y, 1);
      ANALYZE_X_OR_Y(Y, 2);
      ANALYZE_X_OR_Y(Y, 3);
    }
  }

  // Sum up sumX_128 to get sumX.
  _mm_storeu_si128((__m128i*)tmp, sumX_128);
  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  // Sum up sumXY_128 to get sumXY.
  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
  return (float)retval;
}
virtual size_t match(const char* data, size_t size)
{
    __m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
    __m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
    __m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

    size_t offset = firstLetterPos;

    while (offset + 32 <= size)
    {
        __m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));

        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

        if (mask == 0)
            offset += 16;
        else
        {
            offset += re2::countTrailingZeros(mask);

            // check if we have a match
            __m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset - firstLetterOffset));
            __m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

            if (_mm_movemask_epi8(matchMask) == 0xffff)
            {
                // final check for full pattern
                if (memcmp(data + offset - firstLetterPos, pattern.c_str(), pattern.size()) == 0)
                {
                    return offset - firstLetterPos;
                }
            }

            offset += 1;
        }
    }

    return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
}
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros     = _mm_setzero_si128();
    const __m128i prefix    = sse::load(needle);
    const __m128i suffix1   = sse::load(needle + 4);
    const __m128i suffix2   = sse::load(needle + 16 + 4);
    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);

            const __m128i c3 = _mm_or_si128(c2, suff_mask);

            const __m128i tmp = _mm_and_si128(c1, c3);

            if (_mm_movemask_epi8(tmp) == 0xffff) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
bool test(const index_t & kmer) const
{
    // _mm_setzero_si128() yields an __m128i; declaring zero with that type (and
    // reinterpreting the float vector below with _mm_castps_si128) keeps the
    // original behaviour while making the types agree.
    __m128i __attribute__ ((aligned (16))) zero = _mm_setzero_si128();
    const size_t BitsPerElement = sizeof(block_t) * 8;
    const Hash hashfunction = Hash();
    kmer_t hashvalue = kmer;

    for (int hcount = this->h; hcount > 0; hcount--)
    {
        hashvalue = hashfunction(hashvalue);

        // we expect the compiler to automatically turn this into a shift because it's a const power of two
        size_t offset = (hashvalue % this->m) / BitsPerElement;

        if (_mm_movemask_epi8(_mm_cmpeq_epi32(
                _mm_castps_si128(_mm_and_ps(bitarray[offset], masks[hashvalue & (BitsPerElement - 1)])),
                zero)) != 0xFFFF)
            return false;
    }
    return true;
}
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for(int i=0; i<length; i+=16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if(_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); //change -1 values to 1
            //horizontal sum of 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp,zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1,2);
            __m128i sum3 = _mm_add_epi16(sum1,sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
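/*
 * A quick way to exercise countZeroBytes_SSE() is to compare it against a plain
 * byte loop. The buffer and main() below are assumptions for illustration; the
 * length is kept a multiple of 16 because the function has no scalar tail.
 */
#include <stdio.h>

int main(void)
{
    char buf[64];
    int expected = 0;
    for (int i = 0; i < 64; i++) {
        buf[i] = (i % 5 == 0) ? 0 : (char)i;   /* sprinkle some zero bytes */
        if (buf[i] == 0) expected++;
    }
    int got = countZeroBytes_SSE(buf, 64);
    printf("scalar: %d, SSE: %d\n", expected, got);
    return (got == expected) ? 0 : 1;
}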