int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce, const unsigned char *key)
{
    const __m128i K0 = LOADU(key +  0);
    const __m128i K1 = LOADU(key + 16);
    __m128i S[8];

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, K0, K1);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K0, K1);

    /* Verify tag */
    S[6] = _mm_cmpeq_epi8(S[6], LOADU(c + clen - BYTES(NORX_T)  ));
    S[7] = _mm_cmpeq_epi8(S[7], LOADU(c + clen - BYTES(NORX_T)/2));
    return (((_mm_movemask_epi8(AND(S[6], S[7])) & 0xFFFFUL) + 1) >> 16) - 1;
}
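The return statement is a branch-free tag check: the movemask is 0xFFFF only when every byte compared equal, and ((mask & 0xFFFF) + 1) >> 16 is 1 on a full match and 0 otherwise, so subtracting 1 yields 0 (tag valid) or -1 (tag invalid) with no secret-dependent branch. A minimal standalone sketch of the idiom (verify16 is a hypothetical helper name, not from the original source):

#include <stdint.h>

static int verify16(uint32_t movemask)  /* movemask in [0, 0xFFFF] */
{
    /* 0xFFFF + 1 == 0x10000, so bit 16 is set only on a full match;
     * any mismatch leaves it clear and the result is -1. */
    return (int)(((movemask & 0xFFFFu) + 1) >> 16) - 1;  /* 0 = ok, -1 = fail */
}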
const char *ssechr(const char *s, char ch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i cx16 = _mm_set1_epi8(ch);  // (ch) replicated 16 times.

    while (1) {
        __m128i  x = _mm_loadu_si128((__m128i const *)s);
        unsigned u = _mm_movemask_epi8(_mm_cmpeq_epi8(zero, x));
        // keep only matches strictly before the first NUL byte
        unsigned v = _mm_movemask_epi8(_mm_cmpeq_epi8(cx16, x)) & ~u & (u - 1);
        if (v) return s + __builtin_ctz(v);  // ctz is 0-based, so no "- 1"
        if (u) return NULL;
        s += 16;
    }
}
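A minimal harness for the routine above, assuming ssechr as defined and a buffer padded so the trailing 16-byte load stays in bounds:

#include <stdio.h>

int main(void)
{
    /* Padded to 32 bytes so the final 16-byte load stays inside the buffer. */
    char buf[32] = "find the comma, fast";
    const char *p = ssechr(buf, ',');
    printf("%s\n", p ? p : "(not found)");  /* prints ", fast" */
    return 0;
}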
SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, __m128i index)
{
    for (size_t col = 0; col < alignedSize; col += A)
    {
        if (!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(mask + col)), index), K_INV_ZERO))
            return true;
    }
    if (alignedSize != fullSize)
    {
        if (!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(mask + fullSize - A)), index), K_INV_ZERO))
            return true;
    }
    return false;
}
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros  = _mm_setzero_si128();
    const __m128i prefix = sse::load(needle);
    const __m128i suffix = sse::load(needle + 4);
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);

            if (_mm_testc_si128(cmp, suff_mask)) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce, const unsigned char *key)
{
    __m128i S[4];

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);

    /* Verify tag */
    S[0] = _mm_cmpeq_epi8(S[0], LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm_movemask_epi8(S[0]) & 0xFFFFU) + 1) >> 16) - 1;
}
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length)
{
    // most of our strings will be tens of bytes long
    // -> afford some minor overhead to handle the main part very fast

    const char* end = psz + length;
    if (sse2supported)
    {
        __m128i mask = _mm_set1_epi8 ('%');  // '%' replicated 16 times

        for (; psz + sizeof (mask) <= end; psz += sizeof (mask))
        {
            // fetch the next 16 bytes from the source
            __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz);

            // check for '%'
            int flags = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, mask));
            if (flags != 0)
                return true;
        }
    }

    // scan the odd bytes at the end of the string
    for (; psz < end; ++psz)
        if (*psz == '%')
            return true;

    return false;
}
void mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);
    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);
                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);
                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y,
    uint8_t *running_avg_y, const __m128i k_0, const __m128i k_delta,
    __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
template<> void
copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* src = (const uchar*)_src;
        uchar* dst = (uchar*)_dst;
        int x = 0;
        #if CV_SSE4_2
        if(USE_SSE4_2)
        {
            __m128i zero = _mm_setzero_si128 ();

            for( ; x <= size.width - 16; x += 16 )
            {
                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
                _mm_storeu_si128((__m128i*)(dst + x), rDst);
            }
        }
        #endif
        for( ; x < size.width; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }
}
__m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpeq_epi8
  // DAG: icmp eq <16 x i8>
  //
  // ASM-LABEL: test_mm_cmpeq_epi8
  // ASM: pcmpeqb
  return _mm_cmpeq_epi8(A, B);
}
SIMDValue SIMDInt8x16Operation::OpEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
void* memrchr(const void *src, int c, size_t len)
{
    /* Backwards search */
    const uint8_t* a = src;

    if (!len)
        return NULL;

    size_t i = len;  /* size_t, not int: len may exceed INT_MAX */

    /* Scan bytewise until the remaining tail is 16-byte aligned */
    int aligned_a = ((uintptr_t)a & (sizeof(__m128i) - 1));
    if (aligned_a) {
        while (i && ((uintptr_t)&a[i] & (sizeof(__m128i) - 1))) {
            i--;
            if (a[i] == (uint8_t)c) {  /* compare as unsigned char */
                return (void *)(a + i);
            }
        }
    }

    if (i >= 16) {
        __m128i r1 = _mm_set1_epi8((char)c);  /* c broadcast to 16 bytes */

        while (i >= 16) {
            i -= 16;
            __m128i x = _mm_loadu_si128((const __m128i*)&a[i]);  /* 16 bytes */
            __m128i cmp = _mm_cmpeq_epi8(x, r1);
            uint16_t result = (uint16_t)_mm_movemask_epi8(cmp);
            if (result != 0x0000U) {
                /* Find the highest set bit = last matching byte */
                i += 15;
                while (!(result & 0x8000)) {
                    result = result << 1;
                    i--;
                }
                return (void *)(a + i);
            }
        }
    }

    while (i) {
        i--;
        if (a[i] == (uint8_t)c) {
            return (void *)(a + i);
        }
    }
    return NULL;
}
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** ptrSSE = (__m128i**) ptr;

    // note that the comparison order is modified in each block (but the first
    // 128 comparisons remain globally the same --> does not affect the
    // 128- and 384-bit segmented matching strategy)
    int cnt = 0;
    for( int n = FREAK_NB_PAIRS/128; n-- ; )
    {
        __m128i result128 = _mm_setzero_si128();
        for( int m = 128/16; m--; cnt += 16 )
        {
            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
                                            pointsValue[descriptionPairs[cnt+1].i],
                                            pointsValue[descriptionPairs[cnt+2].i],
                                            pointsValue[descriptionPairs[cnt+3].i],
                                            pointsValue[descriptionPairs[cnt+4].i],
                                            pointsValue[descriptionPairs[cnt+5].i],
                                            pointsValue[descriptionPairs[cnt+6].i],
                                            pointsValue[descriptionPairs[cnt+7].i],
                                            pointsValue[descriptionPairs[cnt+8].i],
                                            pointsValue[descriptionPairs[cnt+9].i],
                                            pointsValue[descriptionPairs[cnt+10].i],
                                            pointsValue[descriptionPairs[cnt+11].i],
                                            pointsValue[descriptionPairs[cnt+12].i],
                                            pointsValue[descriptionPairs[cnt+13].i],
                                            pointsValue[descriptionPairs[cnt+14].i],
                                            pointsValue[descriptionPairs[cnt+15].i]);

            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
                                            pointsValue[descriptionPairs[cnt+1].j],
                                            pointsValue[descriptionPairs[cnt+2].j],
                                            pointsValue[descriptionPairs[cnt+3].j],
                                            pointsValue[descriptionPairs[cnt+4].j],
                                            pointsValue[descriptionPairs[cnt+5].j],
                                            pointsValue[descriptionPairs[cnt+6].j],
                                            pointsValue[descriptionPairs[cnt+7].j],
                                            pointsValue[descriptionPairs[cnt+8].j],
                                            pointsValue[descriptionPairs[cnt+9].j],
                                            pointsValue[descriptionPairs[cnt+10].j],
                                            pointsValue[descriptionPairs[cnt+11].j],
                                            pointsValue[descriptionPairs[cnt+12].j],
                                            pointsValue[descriptionPairs[cnt+13].j],
                                            pointsValue[descriptionPairs[cnt+14].j],
                                            pointsValue[descriptionPairs[cnt+15].j]);

            __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
            workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers

            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits into the 128-bit vector until full
            result128 = _mm_or_si128(result128, workReg);
        }
        (**ptrSSE) = result128;
        ++(*ptrSSE);
    }
    (*ptrSSE) -= 8;
}
SIMDValue SIMDInt8x16Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_max_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value);   // max(a,b)
    x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, x86Result.m128i_value); // a == max(a,b) <=> a >= b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
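The pairing of _mm_max_epi8 (SSE4.1) with _mm_cmpeq_epi8 relies on the identity a >= b <=> max(a, b) == a, which sidesteps the lack of a dedicated "greater or equal" byte compare. A minimal scalar check of the identity (a standalone sketch, not part of the original source):

#include <assert.h>
#include <stdint.h>

static int8_t max_i8(int8_t a, int8_t b) { return a > b ? a : b; }

int main(void)
{
    /* Exhaustively verify a >= b  <=>  max(a, b) == a for all signed bytes,
     * the scalar analogue of the _mm_max_epi8/_mm_cmpeq_epi8 pairing above. */
    for (int a = -128; a <= 127; ++a)
        for (int b = -128; b <= 127; ++b)
            assert((a >= b) == (max_i8((int8_t)a, (int8_t)b) == a));
    return 0;
}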
virtual size_t match(const char* data, size_t size)
{
    __m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
    __m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
    __m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

    size_t offset = firstLetterPos;

    while (offset + 32 <= size)
    {
        __m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));

        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

        // advance offset regardless of match results to reduce number of live values
        offset += 16;

        while (mask != 0)
        {
            unsigned int pos = re2::countTrailingZeros(mask);
            size_t dataOffset = offset - 16 + pos - firstLetterOffset;

            mask &= ~(1 << pos);

            // check if we have a match
            __m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + dataOffset));
            __m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

            if (_mm_movemask_epi8(matchMask) == 0xffff)
            {
                size_t matchOffset = dataOffset + firstLetterOffset - firstLetterPos;

                // final check for full pattern
                if (matchOffset + pattern.size() < size && memcmp(data + matchOffset, pattern.c_str(), pattern.size()) == 0)
                {
                    return matchOffset;
                }
            }
        }
    }

    return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
}
SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m128i index, uint8_t * cols)
{
    __m128i _cols = _mm_setzero_si128();
    for (size_t row = 0; row < size; ++row)
    {
        _cols = _mm_or_si128(_cols, _mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)mask), index));
        mask += stride;
    }
    _mm_storeu_si128((__m128i*)cols, _cols);
    return !_mm_testz_si128(_cols, K_INV_ZERO);
}
__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpeq_epi8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
SIMDValue SIMDInt8x16Operation::OpNotEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b
    X86SIMDValue negativeOnes = { { -1, -1, -1, -1 } };
    x86Result.m128i_value = _mm_andnot_si128(x86Result.m128i_value, negativeOnes.m128i_value); // invert to get a != b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into an 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, int ithresh,
                                    __m128i* const mask) {
  const __m128i it = _mm_set1_epi8(ithresh);
  const __m128i diff = _mm_subs_epu8(*mask, it);
  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
  __m128i filter_mask;
  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
  *mask = _mm_and_si128(thresh_mask, filter_mask);
}
virtual size_t match(const char* data, size_t size)
{
    __m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
    __m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
    __m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

    size_t offset = firstLetterPos;

    while (offset + 32 <= size)
    {
        __m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));

        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

        if (mask == 0)
            offset += 16;
        else
        {
            offset += re2::countTrailingZeros(mask);

            // check if we have a match
            __m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset - firstLetterOffset));
            __m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

            if (_mm_movemask_epi8(matchMask) == 0xffff)
            {
                // final check for full pattern
                if (memcmp(data + offset - firstLetterPos, pattern.c_str(), pattern.size()) == 0)
                {
                    return offset - firstLetterPos;
                }
            }

            offset += 1;
        }
    }

    return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
}
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros   = _mm_setzero_si128();
    const __m128i prefix  = sse::load(needle);
    const __m128i suffix1 = sse::load(needle + 4);
    const __m128i suffix2 = sse::load(needle + 16 + 4);
    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i c1  = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
            const __m128i c2  = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);
            const __m128i c3  = _mm_or_si128(c2, suff_mask);
            const __m128i tmp = _mm_and_si128(c1, c3);

            if (_mm_movemask_epi8(tmp) == 0xffff) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y,
    uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4,
    const __m128i *k_8, const __m128i *k_16, const __m128i *l3,
    const __m128i *l32, const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;
  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);
  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
static void NeedsFilter(const __m128i* p1, const __m128i* p0,
                        const __m128i* q0, const __m128i* q1,
                        int thresh, __m128i *mask) {
  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  *mask = _mm_set1_epi8(0xFE);
  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2

  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2

  t1 = _mm_set1_epi8(thresh);
  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
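SSE2 has no unsigned byte comparison, so the "mask <= thresh" step is computed as subs_epu8(x, thresh) == 0: the saturating subtraction bottoms out at zero exactly when x does not exceed the threshold. A standalone scalar check of that idiom (subs_u8 is a hypothetical helper mirroring _mm_subs_epu8):

#include <assert.h>
#include <stdint.h>

/* Saturating subtraction, the scalar analogue of _mm_subs_epu8. */
static uint8_t subs_u8(uint8_t x, uint8_t t) { return x > t ? x - t : 0; }

int main(void)
{
    /* Exhaustively verify x <= t  <=>  subs_u8(x, t) == 0. */
    for (int x = 0; x < 256; ++x)
        for (int t = 0; t < 256; ++t)
            assert((x <= t) == (subs_u8((uint8_t)x, (uint8_t)t) == 0));
    return 0;
}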
SIMDValue SIMDUint8x16Operation::OpLessThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    X86SIMDValue signBits = { { 0x80808080, 0x80808080, 0x80808080, 0x80808080 } };

    // An unsigned comparison can be done with the signed compare instructions
    // once the sign bit of every byte has been flipped.
    tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, signBits.m128i_value);
    tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, signBits.m128i_value);

    x86Result.m128i_value = _mm_cmplt_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?
    tmpaValue.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?
    x86Result.m128i_value = _mm_or_si128(x86Result.m128i_value, tmpaValue.m128i_value);   // result = (a<b)|(a==b)

    return X86SIMDValue::ToSIMDValue(x86Result);
}
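The XOR with 0x80 maps unsigned byte order onto signed byte order (0x00 becomes -128, 0xFF becomes 127), which is why the signed _mm_cmplt_epi8 then gives the unsigned result. A standalone scalar check of the identity (not part of the original source):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Verify (uint8)a < (uint8)b  <=>  (int8)(a ^ 0x80) < (int8)(b ^ 0x80)
     * for all byte pairs -- the identity behind the sign-bit flip above. */
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
            assert((a < b) == ((int8_t)(a ^ 0x80) < (int8_t)(b ^ 0x80)));
    return 0;
}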
// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
                                  const __m128i* const p0,
                                  const __m128i* const q0,
                                  const __m128i* const q1,
                                  int hev_thresh, __m128i* const not_hev) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t_1 = MM_ABS(*p1, *p0);
  const __m128i t_2 = MM_ABS(*q1, *q0);

  const __m128i h = _mm_set1_epi8(hev_thresh);
  const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_thresh
  const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_thresh

  *not_hev = _mm_or_si128(t_3, t_4);
  *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // not_hev <= t1 && not_hev <= t2
}
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* print128() is assumed to dump the 16 bytes of a vector; a minimal
 * implementation is sketched here so the example compiles standalone. */
static void print128(__m128i v)
{
    int8_t b[16];
    _mm_storeu_si128((__m128i*)b, v);
    for (int i = 0; i < 16; i++)
        printf("%4d", b[i]);
    printf("\n");
}

int main()
{
    int8_t m[16] = {45, 37, 35, 45, 37, 35, 45, 37, 35, 45, 37, 35, 0, 0, 0, 0};
    int8_t s[16] = {-4, -4, -4, -4, 1, 1, -4, -4, -4, -4, 1, 1, -4, -4, -4, -4};

    /* unaligned loads: the stack arrays are not guaranteed 16-byte aligned */
    __m128i M = _mm_loadu_si128((__m128i*)m);
    __m128i S = _mm_loadu_si128((__m128i*)s);
    __m128i zero = _mm_set1_epi32(0);

    //__m128i sum = _mm_mask_add_epi8(zero, _mm_cmpneq_epi8_mask(M, zero), S, S);
    __m128i sum = _mm_andnot_si128(_mm_cmpeq_epi8(M, zero), _mm_add_epi8(M, S));

    print128(M);
    print128(S);
    print128(sum);
    return 0;
}
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
  const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
  const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2

  const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
  const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
  const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

  const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
  *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
int countZeroBytes_SSE(char* values, int length) {
    // assumes length is a multiple of 16
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16  = _mm_set1_epi8(1);
    for (int i = 0; i < length; i += 16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if (_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp);  // change -1 values to 1
            // horizontal sum of 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp, zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1, 2);
            __m128i sum3 = _mm_add_epi16(sum1, sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
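_mm_sad_epu8 against zero sums the absolute values of each 8-byte half into one partial sum per 64-bit lane; the shuffle-and-add then folds the two partial sums together. A standalone sketch of that horizontal-sum idiom (hsum_epu8 is a hypothetical name; a 32-bit add is used here for clarity, though the sums fit in 16 bits):

#include <stdio.h>
#include <emmintrin.h>

/* Horizontal sum of 16 unsigned bytes via _mm_sad_epu8 against zero. */
static int hsum_epu8(__m128i v)
{
    __m128i zero = _mm_setzero_si128();
    __m128i sums = _mm_sad_epu8(v, zero);       /* two 64-bit partial sums */
    __m128i hi   = _mm_shuffle_epi32(sums, 2);  /* move the upper sum down */
    return _mm_cvtsi128_si32(_mm_add_epi32(sums, hi));
}

int main(void)
{
    __m128i v = _mm_set1_epi8(3);   /* 16 bytes of 3 */
    printf("%d\n", hsum_epu8(v));   /* prints 48 */
    return 0;
}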
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into an 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}