static int HafCpu_Histogram3Thresholds_DATA_U8
	(
		vx_uint32     dstHist[],
		vx_uint8      distThreshold0,
		vx_uint8      distThreshold1,
		vx_uint8      distThreshold2,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	)
{
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i T0 = _mm_set1_epi8((char)((distThreshold0 - 1) ^ 0x80));
	__m128i T1 = _mm_set1_epi8((char)((distThreshold1 - 1) ^ 0x80));
	__m128i T2 = _mm_set1_epi8((char)((distThreshold2 - 1) ^ 0x80));
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	__m128i count0 = _mm_set1_epi8((char)0);
	__m128i count1 = _mm_set1_epi8((char)0);
	__m128i count2 = _mm_set1_epi8((char)0);
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++)
	{
		__m128i * src = (__m128i *)srcRow;
		for (unsigned int x = 0; x < width; x++)
		{
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			__m128i cmpout;
			cmpout = _mm_cmpgt_epi8(pixels, T0);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count0 = _mm_add_epi32(count0, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T1);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count1 = _mm_add_epi32(count1, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T2);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count2 = _mm_add_epi32(count2, cmpout);
		}
		srcRow += srcImageStrideInBytes;
	}
	// extract histogram from count: special case needed when T1 == T2
	dstHist[0] = M128I(count0).m128i_u32[0] + M128I(count0).m128i_u32[2];
	dstHist[1] = M128I(count1).m128i_u32[0] + M128I(count1).m128i_u32[2] - dstHist[0];
	dstHist[2] = M128I(count2).m128i_u32[0] + M128I(count2).m128i_u32[2] - dstHist[0] - dstHist[1];
	dstHist[3] = srcWidth * srcHeight - dstHist[0] - dstHist[1] - dstHist[2];
	if (M128I(T1).m128i_i8[0] == M128I(T2).m128i_i8[0]) {
		dstHist[2] = dstHist[3];
		dstHist[3] = 0;
	}
	return AGO_SUCCESS;
}
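// A minimal, self-contained sketch (not part of the kernel above; the helper
// name cmpgt_epu8 is hypothetical) of the unsigned-compare trick the histogram
// kernels rely on: _mm_cmpgt_epi8 only compares signed bytes, so both operands
// are XORed with 0x80, which remaps 0..255 onto -128..127 while preserving order.
#include <emmintrin.h>

static inline __m128i cmpgt_epu8(__m128i a, __m128i b)
{
	const __m128i bias = _mm_set1_epi8((char)0x80);
	return _mm_cmpgt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}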
// Denoise a 16x1 vector. static INLINE __m128i vp9_denoiser_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, const __m128i *k_16, const __m128i *l3, const __m128i *l32, const __m128i *l21, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); // Clamp absolute difference to 16 to be used to get mask. Doing this // allows us to use _mm_cmpgt_epi8, which operates on signed byte. const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); // Get masks for l2 l1 and l0 adjustments. const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); // Get adjustments for l2, l1, and l0. __m128i adj2 = _mm_and_si128(mask2, *l32); const __m128i adj1 = _mm_and_si128(mask1, *l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; // Combine the adjustments and get absolute adjustments. adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(*l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); // Restore the sign and get positive and negative adjustments. padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Adjustments <=7, and each element in acc_diff can fit in signed // char. acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); return acc_diff; }
inline void casefoldRange(char* dest, const char* begin, const char* end)
{
	if (end - begin < 64)
	{
		// short string, don't bother optimizing
		for (const char* i = begin; i != end; ++i)
			*dest++ = casefold(*i);
	}
	else
	{
		// Shift 'A'..'Z' range ([65..90]) to [102..127] to use one signed comparison insn
		__m128i shiftAmount = _mm_set1_epi8(127 - 'Z');
		__m128i lowerBound = _mm_set1_epi8(127 - ('Z' - 'A') - 1);
		__m128i upperBit = _mm_set1_epi8(0x20);

		const char* i = begin;

		for (; i + 16 < end; i += 16)
		{
			__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(i));
			__m128i upperMask = _mm_cmpgt_epi8(_mm_add_epi8(v, shiftAmount), lowerBound);
			__m128i cfv = _mm_or_si128(v, _mm_and_si128(upperMask, upperBit));
			_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), cfv);
			dest += 16;
		}

		for (; i != end; ++i)
			*dest++ = casefold(*i);
	}
}
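// A hedged, self-contained sketch (the helper name tolower_ascii_16 is
// hypothetical) of the single-compare uppercase test used above: adding
// 127 - 'Z' moves 'A'..'Z' to 102..127, so only those bytes compare greater
// than 101 with the signed _mm_cmpgt_epi8, and OR-ing 0x20 into the selected
// lanes lowercases them while leaving every other byte untouched.
#include <emmintrin.h>

static inline __m128i tolower_ascii_16(__m128i v)
{
	const __m128i shift = _mm_set1_epi8(127 - 'Z');             /* move 'Z' up to 127 */
	const __m128i bound = _mm_set1_epi8(127 - ('Z' - 'A') - 1); /* 101, just below shifted 'A' */
	__m128i is_upper = _mm_cmpgt_epi8(_mm_add_epi8(v, shift), bound);
	return _mm_or_si128(v, _mm_and_si128(is_upper, _mm_set1_epi8(0x20)));
}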
__m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpgt_epi8
  // DAG: icmp sgt <16 x i8>
  //
  // ASM-LABEL: test_mm_cmpgt_epi8
  // ASM: pcmpgtb
  return _mm_cmpgt_epi8(A, B);
}
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i signs = _mm_cmpgt_epi8(zero, *x);
  const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs);  // s8 -> s16 sign extend
  const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
  *x = _mm_packs_epi16(lo_1, hi_1);
}
SIMDValue SIMDInt8x16Operation::OpGreaterThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmpgt_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a > b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
mlib_status __mlib_VectorConvert_U8_S8_Sat( mlib_u8 *z, const mlib_s8 *x, mlib_s32 n) { if (n < 1) return (MLIB_FAILURE); mlib_s32 i, ax, az, nstep, n1, n2, n3, xval; mlib_s8 *px = (mlib_s8 *)x; mlib_u8 *pz = (mlib_u8 *)z; __m128i zbuf, xbuf, zero, mask; zero = _mm_setzero_si128(); ax = (mlib_addr)x & 15; az = (mlib_addr)z & 15; nstep = 16 / sizeof (mlib_u8); n1 = ((16 - ax) & 15) / sizeof (mlib_u8); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { xval = *px++; if (xval < 0) xval = 0; *pz++ = xval; } } else { for (i = 0; i < n1; i++) { xval = *px++; if (xval < 0) xval = 0; *pz++ = xval; } for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); mask = _mm_cmpgt_epi8(zero, xbuf); zbuf = _mm_andnot_si128(mask, xbuf); _mm_storeu_si128((__m128i *)pz, zbuf); px += nstep; pz += nstep; } for (i = 0; i < n3; i++) { xval = *px++; if (xval < 0) xval = 0; *pz++ = xval; } } return (MLIB_SUCCESS); }
__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpgt_epi8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
// The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well
// It processes the pixels in a width which is the next highest multiple of 16 after dstWidth
static int HafCpu_Histogram1Threshold_DATA_U8
	(
		vx_uint32     dstHist[],
		vx_uint8      distThreshold,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	)
{
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i thresh = _mm_set1_epi8((char)((distThreshold - 1) ^ 0x80));
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	__m128i count = _mm_set1_epi8((char)0);
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++)
	{
		__m128i * src = (__m128i *)srcRow;
		for (unsigned int x = 0; x < width; x++)
		{
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			pixels = _mm_cmpgt_epi8(pixels, thresh);
			pixels = _mm_and_si128(pixels, onemask);
			pixels = _mm_sad_epu8(pixels, onemask);
			count = _mm_add_epi32(count, pixels);
		}
		srcRow += srcImageStrideInBytes;
	}
	// extract histogram from count
	dstHist[0] = M128I(count).m128i_u32[0] + M128I(count).m128i_u32[2];
	dstHist[1] = srcWidth * srcHeight - dstHist[0];
	return AGO_SUCCESS;
}
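// Hedged sketch (the helper name count_below_threshold_16 is hypothetical, and
// like the kernels above it assumes threshold >= 1) of the counting idiom:
// after the comparison result is masked down to 0/1 per byte, _mm_sad_epu8
// against a vector of ones sums |b - 1| over each 8-byte half, i.e. it counts
// the lanes where the comparison was false ("pixel < threshold"), leaving two
// partial counts in the low and high 64-bit halves of the register.
#include <emmintrin.h>
#include <stdint.h>

static inline uint32_t count_below_threshold_16(__m128i pixels, uint8_t threshold)
{
	const __m128i offset = _mm_set1_epi8((char)0x80);
	const __m128i ones   = _mm_set1_epi8(1);
	const __m128i thresh = _mm_set1_epi8((char)((threshold - 1) ^ 0x80));
	__m128i ge = _mm_cmpgt_epi8(_mm_xor_si128(pixels, offset), thresh); /* pixel >= threshold */
	__m128i c  = _mm_sad_epu8(_mm_and_si128(ge, ones), ones);           /* counts of false lanes */
	return (uint32_t)(_mm_cvtsi128_si32(c) + _mm_cvtsi128_si32(_mm_srli_si128(c, 8)));
}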
static inline __m128i _mm_min_epi8_rpl(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpgt_epi8(b, a);
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    return _mm_or_si128(a, b);
}
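// A companion sketch built the same way (the name _mm_max_epi8_rpl is
// hypothetical): reusing an _mm_cmpgt_epi8 mask but selecting the other operand
// gives the signed per-byte maximum. SSE4.1 provides _mm_min_epi8/_mm_max_epi8
// directly; these emulations only need SSE2.
#include <emmintrin.h>

static inline __m128i _mm_max_epi8_rpl(__m128i a, __m128i b)
{
	__m128i mask = _mm_cmpgt_epi8(a, b);  /* 0xFF where a > b */
	a = _mm_and_si128(a, mask);           /* keep a where it is the larger value */
	b = _mm_andnot_si128(mask, b);        /* keep b elsewhere */
	return _mm_or_si128(a, b);
}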
ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const { size_t col_size = size(); if (col_size != filt.size()) throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); auto res = ColumnFixedString::create(n); if (result_size_hint) res->chars.reserve(result_size_hint > 0 ? result_size_hint * n : chars.size()); const UInt8 * filt_pos = &filt[0]; const UInt8 * filt_end = filt_pos + col_size; const UInt8 * data_pos = &chars[0]; #if __SSE2__ /** A slightly more optimized version. * Based on the assumption that often pieces of consecutive values * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ static constexpr size_t SIMD_BYTES = 16; const __m128i zero16 = _mm_setzero_si128(); const UInt8 * filt_end_sse = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES; const size_t chars_per_simd_elements = SIMD_BYTES * n; while (filt_pos < filt_end_sse) { int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)), zero16)); if (0 == mask) { /// Nothing is inserted. data_pos += chars_per_simd_elements; } else if (0xFFFF == mask) { res->chars.insert(data_pos, data_pos + chars_per_simd_elements); data_pos += chars_per_simd_elements; } else { size_t res_chars_size = res->chars.size(); for (size_t i = 0; i < SIMD_BYTES; ++i) { if (filt_pos[i]) { res->chars.resize(res_chars_size + n); memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); res_chars_size += n; } data_pos += n; } } filt_pos += SIMD_BYTES; } #endif size_t res_chars_size = res->chars.size(); while (filt_pos < filt_end) { if (*filt_pos) { res->chars.resize(res_chars_size + n); memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); res_chars_size += n; } ++filt_pos; data_pos += n; } return std::move(res); }
mlib_status __mlib_VectorSumAbsDiff_S8_Sat( mlib_d64 *z, const mlib_s8 *x, const mlib_s8 *y, mlib_s32 n) { if (n <= 0) return (MLIB_FAILURE); mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0; mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y; __m128i zero, xbuf, ybuf, zbuf, mext, mbuf; zero = _mm_setzero_si128(); zbuf = zero; nstep = 16 / sizeof (mlib_s8); ax = (mlib_addr)x & 15; ay = (mlib_addr)y & 15; n1 = ((16 - ax) & 15) / sizeof (mlib_s8); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } *z = sum; } else { for (i = 0; i < n1; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } if (ax == ay) { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } else { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } for (i = 0; i < n3; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } mlib_d64 dsum = sum; long long pz[2]; _mm_storeu_si128((__m128i *)pz, zbuf); dsum += pz[0]; dsum += pz[1]; *z = dsum; } return (MLIB_SUCCESS); }
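// Hedged, self-contained sketch (the helper name sad_s8_16 is hypothetical) of
// the signed absolute-difference trick used above: the compare mask is all-ones
// exactly where y > x, and (d ^ m) - m conditionally negates d, so every byte
// becomes |x - y| (which always fits in an unsigned byte for s8 inputs);
// _mm_sad_epu8 then folds the 16 absolute differences into two 64-bit partial sums.
#include <emmintrin.h>
#include <stdint.h>

static inline uint32_t sad_s8_16(__m128i x, __m128i y)
{
	__m128i m = _mm_cmpgt_epi8(y, x);                 /* 0xFF where y > x        */
	__m128i d = _mm_sub_epi8(x, y);                   /* x - y, wrapping         */
	d = _mm_sub_epi8(_mm_xor_si128(d, m), m);         /* conditional negation    */
	__m128i s = _mm_sad_epu8(d, _mm_setzero_si128()); /* two 64-bit partial sums */
	return (uint32_t)(_mm_cvtsi128_si32(s) + _mm_cvtsi128_si32(_mm_srli_si128(s, 8)));
}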
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { unsigned char *running_avg_y_start = running_avg_y; unsigned char *sig_start = sig; unsigned int sum_diff_thresh; int r; int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ const __m128i l21 = _mm_set1_epi8(1); for (r = 0; r < 16; ++r) { /* Calculate differences */ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); /* Obtain the sign. FF if diff is negative. */ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); /* Clamp absolute difference to 16 to be used to get mask. Doing this * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); /* Get masks for l2 l1 and l0 adjustments */ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); /* Get adjustments for l2, l1, and l0 */ __m128i adj2 = _mm_and_si128(mask2, l32); const __m128i adj1 = _mm_and_si128(mask1, l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; /* Combine the adjustments and get absolute adjustments. */ adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); /* Restore the sign and get positive and negative adjustments. */ padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); /* Calculate filtered value. */ v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); /* Adjustments <=7, and each element in acc_diff can fit in signed * char. */ acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); /* Update pointers for next iteration. */ sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } { /* Compute the sum of all pixel differences of this MB. */ unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; if (abs_sum_diff > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. 
The adjustment is capped by a maximum delta, and // chosen such that in most cases the resulting sum_diff will be // within the acceptable range given by sum_diff_thresh. // The delta is set by the excess of absolute pixel diff over the // threshold. int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); sig -= sig_stride * 16; mc_running_avg_y -= mc_avg_y_stride * 16; running_avg_y -= avg_y_stride * 16; for (r = 0; r < 16; ++r) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); // Calculate differences. const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); // Clamp absolute difference to delta to get the adjustment. const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); // Restore the sign and get positive and negative adjustments. __m128i padj, nadj; padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Accumulate the adjustments. acc_diff = _mm_subs_epi8(acc_diff, padj); acc_diff = _mm_adds_epi8(acc_diff, nadj); // Update pointers for next iteration. sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } abs_sum_diff = abs_sum_diff_16x1(acc_diff); if (abs_sum_diff > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } }
// count genotype sum and number of calls, not requiring 16-aligned p COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p, size_t n, C_Int32 &out_sum, C_Int32 &out_num) { C_Int32 sum=0, num=0; #if defined(COREARRAY_SIMD_AVX2) const __m256i three = _mm256_set1_epi8(3); const __m256i zero = _mm256_setzero_si256(); __m256i sum32 = zero, num32 = zero; size_t limit_by_U8 = 0; for (; n >= 32; ) { __m256i v = _mm256_loadu_si256((__m256i const*)p); p += 32; __m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three)); sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m)); num32 = _mm256_sub_epi8(num32, m); n -= 32; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 32)) { // add to sum sum32 = _mm256_sad_epu8(sum32, zero); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2))); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1))); sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32)); // add to num num32 = _mm256_sad_epu8(num32, zero); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2))); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1))); num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32)); // reset sum32 = num32 = zero; limit_by_U8 = 0; } } #elif defined(COREARRAY_SIMD_SSE2) // header, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p++) if (*p <= 2) { sum += *p; num++; } const __m128i three = _mm_set1_epi8(3); const __m128i zero = _mm_setzero_si128(); __m128i sum16=zero, num16=zero; size_t limit_by_U8 = 0; for (; n >= 16; ) { __m128i v = _mm_load_si128((__m128i const*)p); p += 16; __m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three)); sum16 = _mm_add_epi8(sum16, v & m); num16 = _mm_sub_epi8(num16, m); n -= 16; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 16)) { // add to sum sum16 = _mm_sad_epu8(sum16, zero); sum += _mm_cvtsi128_si32(sum16); sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2)); // add to num num16 = _mm_sad_epu8(num16, zero); num += _mm_cvtsi128_si32(num16); num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2)); // reset sum16 = num16 = zero; limit_by_U8 = 0; } } #endif for (; n > 0; n--, p++) if (*p <= 2) { sum += *p; num++; } out_sum = sum; out_num = num; return p; }
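// Hedged sketch of the overflow-avoidance pattern above (the helper name is
// hypothetical): the per-byte accumulators are widened with _mm_sad_epu8 and
// flushed into a scalar total before they can overflow, which is why the loop
// above flushes after at most 127 iterations (the limit_by_U8 guard).
#include <emmintrin.h>
#include <stdint.h>

static inline uint32_t flush_u8_accumulator(__m128i acc)
{
	__m128i s = _mm_sad_epu8(acc, _mm_setzero_si128());
	return (uint32_t)(_mm_cvtsi128_si32(s) + _mm_cvtsi128_si32(_mm_shuffle_epi32(s, 2)));
}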
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression) { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; #if CV_SSE2 const int quarterPatternSize = patternSize/4; (void)quarterPatternSize; #endif int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); keypoints.clear(); threshold = std::min(std::max(threshold, 0), 255); #if CV_SSE2 __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K); (void)K16; (void)delta; (void)t; #endif uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3]; buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; int* cpbuf[3]; cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; cpbuf[1] = cpbuf[0] + img.cols + 1; cpbuf[2] = cpbuf[1] + img.cols + 1; memset(buf[0], 0, img.cols*3); for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr<uchar>(i) + 3; uchar* curr = buf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3]; memset(curr, 0, img.cols); int ncorners = 0; if( i < img.rows - 3 ) { j = 3; #if CV_SSE2 if( patternSize == 16 ) { for(; j < img.cols - 16 - 3; j += 16, ptr += 16) { __m128i m0, m1; __m128i v0 = _mm_loadu_si128((const __m128i*)ptr); __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta); v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta); __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta); __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta); __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta); __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta); m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0)); m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1)); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0))); m0 = _mm_or_si128(m0, m1); int mask = _mm_movemask_epi8(m0); if( mask == 0 ) continue; if( (mask & 255) == 0 ) { j -= 8; ptr -= 8; continue; } __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0; for( k = 0; k < N; k++ ) { __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta); m0 = _mm_cmpgt_epi8(x, v0); m1 = _mm_cmpgt_epi8(v1, x); c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0); c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1); max0 = _mm_max_epu8(max0, c0); max1 = _mm_max_epu8(max1, c1); } max0 = _mm_max_epu8(max0, max1); int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) if(m & 1) { cornerpos[ncorners++] = j+k; if(nonmax_suppression) curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold); } } } #endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; const uchar* tab = &threshold_tab[0] - v + 255; int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; if( d == 0 ) 
continue; d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; if( d & 1 ) { int vt = v - threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x < vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } if( d & 2 ) { int vt = v + threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } } } cornerpos[-1] = ncorners; if( i == 3 ) continue; const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3]; ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ ) { j = cornerpos[k]; int score = prev[j]; if( !nonmax_suppression || (score > prev[j+1] && score > prev[j-1] && score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) { keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score)); } } }
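// Hedged sketch (the helper name run_length_step is hypothetical) of the
// consecutive-count idiom in the FAST loop above: the compare mask is -1 (0xFF)
// in lanes where the test holds, so counter - mask increments exactly those
// lanes, AND-ing with the mask resets the lanes where the test failed, and
// _mm_max_epu8 keeps the longest run seen so far per lane.
#include <emmintrin.h>

static inline void run_length_step(__m128i *counter, __m128i *longest, __m128i mask)
{
	*counter = _mm_and_si128(_mm_sub_epi8(*counter, mask), mask);
	*longest = _mm_max_epu8(*longest, *counter);
}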
ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const { size_t size = data.size(); if (size != filt.size()) throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); auto res = this->create(); Container & res_data = res->getData(); if (result_size_hint) res_data.reserve(result_size_hint > 0 ? result_size_hint : size); const UInt8 * filt_pos = &filt[0]; const UInt8 * filt_end = filt_pos + size; const T * data_pos = &data[0]; #if __SSE2__ /** A slightly more optimized version. * Based on the assumption that often pieces of consecutive values * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ static constexpr size_t SIMD_BYTES = 16; const __m128i zero16 = _mm_setzero_si128(); const UInt8 * filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)), zero16)); if (0 == mask) { /// Nothing is inserted. } else if (0xFFFF == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { for (size_t i = 0; i < SIMD_BYTES; ++i) if (filt_pos[i]) res_data.push_back(data_pos[i]); } filt_pos += SIMD_BYTES; data_pos += SIMD_BYTES; } #endif while (filt_pos < filt_end) { if (*filt_pos) res_data.push_back(*data_pos); ++filt_pos; ++data_pos; } return std::move(res); }
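// Hedged sketch (the helper name filter_mask16 is hypothetical) of the filter
// fast path used by both filter() implementations above: _mm_cmpgt_epi8 against
// zero turns 16 filter bytes into a byte mask and _mm_movemask_epi8 packs it
// into a 16-bit integer, so an all-zero or all-one block of the filter can be
// skipped or copied wholesale with one branch. The signed compare treats bytes
// >= 0x80 as negative, so the filter values are assumed to be 0 or 1.
#include <emmintrin.h>
#include <stdint.h>

static inline int filter_mask16(const uint8_t *filt)
{
	__m128i f = _mm_loadu_si128((const __m128i *)filt);
	return _mm_movemask_epi8(_mm_cmpgt_epi8(f, _mm_setzero_si128()));
	/* 0x0000: nothing passes; 0xFFFF: everything passes; otherwise per-element */
}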
/// Element-wise comparison for greater than.
inline xmm_i8 operator > (const xmm_i8 &a, const xmm_i8 &b) {
    return _mm_cmpgt_epi8(a, b);
}
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_cmpgt_epi8 (s1, s2);
}
__m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
  // CHECK-LABEL: test_mm_cmpgt_epi8
  // CHECK: icmp sgt <16 x i8>
  return _mm_cmpgt_epi8(A, B);
}
int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg, YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal, unsigned int motion_magnitude, int y_offset, int uv_offset) { unsigned char *sig = signal->thismb; int sig_stride = 16; unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset; int mc_avg_y_stride = mc_running_avg->y_stride; unsigned char *running_avg_y = running_avg->y_buffer + y_offset; int avg_y_stride = running_avg->y_stride; int r; (void)uv_offset; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ const __m128i l21 = _mm_set1_epi8(1); for (r = 0; r < 16; ++r) { /* Calculate differences */ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128( (__m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); /* Obtain the sign. FF if diff is negative. */ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); /* Clamp absolute difference to 16 to be used to get mask. Doing this * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ const __m128i clamped_absdiff = _mm_min_epu8( _mm_or_si128(pdiff, ndiff), k_16); /* Get masks for l2 l1 and l0 adjustments */ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); /* Get adjustments for l2, l1, and l0 */ __m128i adj2 = _mm_and_si128(mask2, l32); const __m128i adj1 = _mm_and_si128(mask1, l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; /* Combine the adjustments and get absolute adjustments. */ adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); /* Restore the sign and get positive and negative adjustments. */ padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); /* Calculate filtered value. */ v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); /* Adjustments <=7, and each element in acc_diff can fit in signed * char. */ acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); /* Update pointers for next iteration. */ sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } { /* Compute the sum of all pixel differences of this MB. */ union sum_union s; int sum_diff = 0; s.v = acc_diff; sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] + s.e[12] + s.e[13] + s.e[14] + s.e[15]; if (abs(sum_diff) > SUM_DIFF_THRESHOLD) { return COPY_BLOCK; } } vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride, signal->thismb, sig_stride); return FILTER_BLOCK; }
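// Hedged sketch (the helper name absdiff_lt_epu8_small is hypothetical) of the
// clamping idiom the denoisers above use: _mm_cmpgt_epi8 is a signed compare,
// so the unsigned absolute difference is first clamped with _mm_min_epu8 to a
// small bound (16 above); both operands are then small non-negative bytes and
// the signed compare against constants like 4, 8 and 16 is safe. The bound must
// be at most 127 for this to hold.
#include <emmintrin.h>

static inline __m128i absdiff_lt_epu8_small(__m128i a, __m128i b, __m128i small_bound)
{
	__m128i ad = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); /* |a - b|, unsigned */
	return _mm_cmpgt_epi8(small_bound, _mm_min_epu8(ad, small_bound));   /* |a - b| < bound   */
}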
static int HafCpu_Histogram16Bins_DATA_U8 ( vx_uint32 * dstHist, vx_uint8 distOffset, vx_uint8 distWindow, vx_uint32 srcWidth, vx_uint32 srcHeight, vx_uint8 * pSrcImage, vx_uint32 srcImageStrideInBytes ) { // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes // thresh: source threshold in -128..127 range __m128i offset = _mm_set1_epi8((char)0x80); __m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80)); __m128i dT = _mm_set1_epi8((char)distWindow); __m128i onemask = _mm_set1_epi8((char)1); // process one pixel row at a time that counts "pixel < srcThreshold" vx_uint32 count[16] = { 0 }; vx_uint8 * srcRow = pSrcImage; vx_uint32 width = (srcWidth + 15) >> 4; for (unsigned int y = 0; y < srcHeight; y++) { __m128i * src = (__m128i *)srcRow; __m128i count0 = _mm_set1_epi8((char)0); __m128i count1 = _mm_set1_epi8((char)0); __m128i count2 = _mm_set1_epi8((char)0); __m128i count3 = _mm_set1_epi8((char)0); for (unsigned int x = 0; x < width; x++) { __m128i pixels = _mm_load_si128(src++); pixels = _mm_xor_si128(pixels, offset); __m128i cmpout, Tnext = T0; // 0..3 cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count0 = _mm_add_epi32(count0, cmpout); // 4..7 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count1 = _mm_add_epi32(count1, cmpout); // 8..11 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count2 = _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count2 = _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count2 
= _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count2 = _mm_add_epi32(count2, cmpout); // 12..15 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count3 = _mm_add_epi32(count3, cmpout); } srcRow += srcImageStrideInBytes; // move counts from count0..2 into count[] for (int i = 0; i < 4; i++) { count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i]; count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i]; count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i]; count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i]; } } // extract histogram from count if (distOffset == 0) { vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15]; for (int i = 14; i >= 0; i--) { count[i] = last - count[i]; last -= count[i]; } dstHist[0] = last; for (int i = 1; i < 16; i++) dstHist[i] = count[i - 1]; } else { vx_uint32 last = srcWidth * srcHeight; for (int i = 15; i >= 0; i--) { count[i] = last - count[i]; last -= count[i]; dstHist[i] = count[i]; } } return AGO_SUCCESS; }
}bool validate_utf8_sse(const char *src, size_t len) { const char *end = src + len; while (src + 16 < end) { __m128i chunk = _mm_loadu_si128((const __m128i *)(src)); int asciiMask = _mm_movemask_epi8(chunk); if (!asciiMask) { src += 16; continue; } __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80)); __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed); __m128i state = _mm_set1_epi8((char)(0x0 | 0x80)); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2); __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3); __m128i mask3 = _mm_slli_si128(cond3, 1); __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed); // Fall back to the scalar processing if (_mm_movemask_epi8(cond4)) { break; } __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1)); __m128i shifts = count_sub1; shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1)); counts = _mm_add_epi8( counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2)); shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2)); if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4)); if (_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8)); __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8)); shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1 chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1), _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1)); __m128i chunk_right = _mm_slli_si128(chunk, 1); __m128i chunk_low = _mm_blendv_epi8( chunk, _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))), _mm_cmpeq_epi8(counts, _mm_set1_epi8(1))); __m128i chunk_high = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2))); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2), _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2)); chunk_high = _mm_srli_epi32(chunk_high, 2); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4), _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4)); chunk_high = _mm_or_si128( chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)), mask3)); int c = _mm_extract_epi16(counts, 7); int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 
15 : 14; __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8)); if (!_mm_testz_si128( mask3, _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)), _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8))))) return false; shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8), _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8)); chunk_high = _mm_slli_si128(chunk_high, 1); __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); chunk_low = _mm_shuffle_epi8(chunk_low, shuf); chunk_high = _mm_shuffle_epi8(chunk_high, shuf); __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high); __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high); if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) | _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) { return false; } src += source_advance; } return validate_utf8(src, end - src); }
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; const __m128i thresh = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); const __m128i blimit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); p2 = _mm256_castsi256_si128(p256_2); p1 = _mm256_castsi256_si128(p256_1); p0 = _mm256_castsi256_si128(p256_0); q0 = _mm256_castsi256_si128(q256_0); q1 = _mm256_castsi256_si128(q256_1); q2 = _mm256_castsi256_si128(q256_2); q3 = _mm256_castsi256_si128(q256_3); q4 = _mm256_castsi256_si128(q256_4); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i te0 = _mm_set1_epi8(0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); 
const __m128i t7f = _mm_set1_epi8(0x7f); __m128i ps1 = _mm_xor_si128(p1, t80); __m128i ps0 = _mm_xor_si128(p0, t80); __m128i qs0 = _mm_xor_si128(q0, t80); __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); /* Filter1 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); /* Filter2 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); /* filt >> 1 */ filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); // loopfilter done { __m128i work; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); q256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); q256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); q256_7 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); flat2 = 
_mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations { const __m256i eight = _mm256_set1_epi16(8); const __m256i four = _mm256_set1_epi16(4); __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; const __m256i filter = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); p256_7 = _mm256_shuffle_epi8(p256_7, filter); p256_6 = _mm256_shuffle_epi8(p256_6, filter); p256_5 = _mm256_shuffle_epi8(p256_5, filter); p256_4 = _mm256_shuffle_epi8(p256_4, filter); p256_3 = _mm256_shuffle_epi8(p256_3, filter); p256_2 = _mm256_shuffle_epi8(p256_2, filter); p256_1 = _mm256_shuffle_epi8(p256_1, filter); p256_0 = _mm256_shuffle_epi8(p256_0, filter); q256_0 = _mm256_shuffle_epi8(q256_0, filter); q256_1 = _mm256_shuffle_epi8(q256_1, filter); q256_2 = _mm256_shuffle_epi8(q256_2, filter); q256_3 = _mm256_shuffle_epi8(q256_3, filter); q256_4 = _mm256_shuffle_epi8(q256_4, filter); q256_5 = _mm256_shuffle_epi8(q256_5, filter); q256_6 = _mm256_shuffle_epi8(q256_6, filter); q256_7 = _mm256_shuffle_epi8(q256_7, filter); pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), _mm256_add_epi16(p256_4, p256_3)); pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), _mm256_add_epi16(q256_4, q256_3)); pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm256_add_epi16( eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm256_add_epi16( four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); flat2_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); flat2_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(p256_3, p256_0)), 3); flat_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(q256_3, q256_0)), 3); flat_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(p256_7, p256_7); sum_q7 = _mm256_add_epi16(q256_7, q256_7); sum_p3 = _mm256_add_epi16(p256_3, p256_3); sum_q3 = _mm256_add_epi16(q256_3, q256_3); pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); flat2_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); flat2_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); pixetFilter_p2p1p0 
= _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(sum_p3, p256_1)), 3); flat_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, _mm256_add_epi16(sum_q3, q256_1)), 3); flat_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); sum_p3 = _mm256_add_epi16(sum_p3, p256_3); sum_q3 = _mm256_add_epi16(sum_q3, q256_3); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); flat2_p2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); flat2_q2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, _mm256_add_epi16(sum_p3, p256_2)), 3); flat_p2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, _mm256_add_epi16(sum_q3, q256_2)), 3); flat_q2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); flat2_p3 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); flat2_q3 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); flat2_p4 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); flat2_q4 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = _mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); flat2_p5 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); flat2_q5 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); sum_p7 = _mm256_add_epi16(sum_p7, p256_7); sum_q7 = 
_mm256_add_epi16(sum_q7, q256_7); pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); res_p = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); flat2_p6 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); res_q = _mm256_srli_epi16( _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); flat2_q6 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ p2 = _mm_andnot_si128(flat, p2); flat_p2 = _mm_and_si128(flat, flat_p2); p2 = _mm_or_si128(flat_p2, p2); p1 = _mm_andnot_si128(flat, ps1); flat_p1 = _mm_and_si128(flat, flat_p1); p1 = _mm_or_si128(flat_p1, p1); p0 = _mm_andnot_si128(flat, ps0); flat_p0 = _mm_and_si128(flat, flat_p0); p0 = _mm_or_si128(flat_p0, p0); q0 = _mm_andnot_si128(flat, qs0); flat_q0 = _mm_and_si128(flat, flat_q0); q0 = _mm_or_si128(flat_q0, q0); q1 = _mm_andnot_si128(flat, qs1); flat_q1 = _mm_and_si128(flat, flat_q1); q1 = _mm_or_si128(flat_q1, q1); q2 = _mm_andnot_si128(flat, q2); flat_q2 = _mm_and_si128(flat, flat_q2); q2 = _mm_or_si128(flat_q2, q2); p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); _mm_storeu_si128((__m128i *)(s - 7 * p), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); _mm_storeu_si128((__m128i *)(s - 6 * p), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); _mm_storeu_si128((__m128i *)(s - 5 * p), p4); p3 = _mm_andnot_si128(flat2, p3); flat2_p3 = _mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); _mm_storeu_si128((__m128i *)(s - 4 * p), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); _mm_storeu_si128((__m128i *)(s + 3 * p), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); _mm_storeu_si128((__m128i *)(s + 4 * p), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); _mm_storeu_si128((__m128i *)(s + 5 * p), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); _mm_storeu_si128((__m128i *)(s + 6 * p), q6); } }
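// Hedged sketch (the helper name srai3_epi8 is hypothetical) of the per-byte
// arithmetic shift used by the loop-filter code above: the bytes are shifted
// logically as 16-bit lanes, the low five bits of each byte are kept (t1f), and
// lanes that were negative (detected with _mm_cmpgt_epi8 against zero) get
// their top three bits refilled with ones (te0), reproducing an arithmetic >> 3
// even though SSE2 has no 8-bit arithmetic shift instruction.
#include <emmintrin.h>

static inline __m128i srai3_epi8(__m128i v)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i t1f  = _mm_set1_epi8(0x1f);
	const __m128i te0  = _mm_set1_epi8((char)0xe0);
	__m128i sign_fill = _mm_and_si128(_mm_cmpgt_epi8(zero, v), te0);
	return _mm_or_si128(_mm_and_si128(_mm_srli_epi16(v, 3), t1f), sign_fill);
}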