static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { uint8_t levels[16], ctxs[16]; uint16_t abs_levels[16]; int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; CostArrayPtr const costs = res->costs; const uint16_t* t = costs[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; if (res->last < 0) { return VP8BitCost(0, p0); } { // precompute clamped levels and contexts, packed to 8b. const __m128i zero = _mm_setzero_si128(); const __m128i kCst2 = _mm_set1_epi8(2); const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL); const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]); const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]); const __m128i D0 = _mm_sub_epi16(zero, c0); const __m128i D1 = _mm_sub_epi16(zero, c1); const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b const __m128i E1 = _mm_max_epi16(c1, D1); const __m128i F = _mm_packs_epi16(E0, E1); const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2 const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67] _mm_storeu_si128((__m128i*)&ctxs[0], G); _mm_storeu_si128((__m128i*)&levels[0], H); _mm_storeu_si128((__m128i*)&abs_levels[0], E0); _mm_storeu_si128((__m128i*)&abs_levels[8], E1); } for (; n < res->last; ++n) { const int ctx = ctxs[n]; const int level = levels[n]; const int flevel = abs_levels[n]; // full level cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() t = costs[n + 1][ctx]; } // Last coefficient is always non-zero { const int level = levels[n]; const int flevel = abs_levels[n]; assert(flevel != 0); cost += VP8LevelFixedCosts[flevel] + t[level]; if (n < 15) { const int b = VP8EncBands[n + 1]; const int ctx = ctxs[n]; const int last_p0 = res->prob[b][ctx][0]; cost += VP8BitCost(0, last_p0); } } return cost; }
static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n, const uint8_t *src2_1)
{
  __m128i xmm1;
  uint8_t max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_epi8(max);
  for (; n >= 16; n -= 16) {
    __m128i xmm0;
    xmm0 = _mm_loadu_si128((__m128i *)src1);
    xmm0 = _mm_min_epu8(xmm0, xmm1);
    _mm_store_si128((__m128i *)dest, xmm0);
    dest += 16;
    src1 += 16;
  }
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
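A small, hypothetical test driver for the routine above; since the function is declared static, it would have to live in the same translation unit. It clamps a 100-byte ramp against an upper bound of 200 and compares the result with a scalar reference.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  enum { N = 100 };
  uint8_t src[N], dst[N], ref[N];
  const uint8_t limit = 200;

  for (int i = 0; i < N; i++) {
    src[i] = (uint8_t)(i * 3);                 /* wraps past 255 on purpose */
    ref[i] = src[i] > limit ? limit : src[i];  /* scalar reference */
  }

  clamphigh_u8_sse(dst, src, N, &limit);
  printf("%s\n", memcmp(dst, ref, N) == 0 ? "match" : "MISMATCH");
  return 0;
}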
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y,
    uint8_t *running_avg_y, const __m128i k_0, const __m128i k_delta,
    __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius, int width, int height, int srcStride, int dstStride) { const int srcStrideX = direction == kX ? 1 : srcStride; const int dstStrideX = direction == kX ? 1 : dstStride; const int srcStrideY = direction == kX ? srcStride : 1; const int dstStrideY = direction == kX ? dstStride : 1; radius = SkMin32(radius, width - 1); const SkPMColor* upperSrc = src + radius * srcStrideX; for (int x = 0; x < width; ++x) { const SkPMColor* lp = src; const SkPMColor* up = upperSrc; SkPMColor* dptr = dst; for (int y = 0; y < height; ++y) { __m128i max = type == kDilate ? _mm_setzero_si128() : _mm_set1_epi32(0xFFFFFFFF); for (const SkPMColor* p = lp; p <= up; p += srcStrideX) { __m128i src_pixel = _mm_cvtsi32_si128(*p); max = type == kDilate ? _mm_max_epu8(src_pixel, max) : _mm_min_epu8(src_pixel, max); } *dptr = _mm_cvtsi128_si32(max); dptr += dstStrideY; lp += srcStrideY; up += srcStrideY; } if (x >= radius) { src += srcStrideX; } if (x + radius < width - 1) { upperSrc += srcStrideX; } dst += dstStrideX; } }
__m128i test_mm_min_epu8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_min_epu8
  // DAG: call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_epu8
  // ASM: pminub
  return _mm_min_epu8(A, B);
}
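As a quick illustration of what the intrinsic computes (a standalone sketch, not part of the test file above): _mm_min_epu8 takes the per-lane minimum of 16 unsigned bytes, so a value such as 0xF0 counts as 240 rather than -16.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint8_t a[16] = { 0, 10, 0xF0, 200, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
  const uint8_t b[16] = { 5,  5, 0x10, 201, 0, 9, 9, 9, 9, 9, 9, 9, 9,  9,  9,  9 };
  uint8_t r[16];

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  _mm_storeu_si128((__m128i *)r, _mm_min_epu8(va, vb));

  for (int i = 0; i < 16; i++)
    printf("%u ", r[i]);   /* prints: 0 5 16 200 0 2 3 4 5 6 7 8 9 9 9 9 */
  printf("\n");
  return 0;
}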
mlib_status __mlib_VectorConvert_S8_U8_Sat( mlib_s8 *z, const mlib_u8 *x, mlib_s32 n) { if (n < 1) return (MLIB_FAILURE); mlib_s32 i, ax, az, nstep, n1, n2, n3, xval; mlib_u8 *px = (mlib_u8 *)x; mlib_s8 *pz = (mlib_s8 *)z; __m128i zbuf, xbuf, mask; mask = _mm_set1_epi8(127); ax = (mlib_addr)x & 15; az = (mlib_addr)z & 15; nstep = 16 / sizeof (mlib_u8); n1 = ((16 - ax) & 15) / sizeof (mlib_u8); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { xval = *px++; if (xval > 127) xval = 127; *pz++ = xval; } } else { for (i = 0; i < n1; i++) { xval = *px++; if (xval > 127) xval = 127; *pz++ = xval; } for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); zbuf = _mm_min_epu8(xbuf, mask); _mm_storeu_si128((__m128i *)pz, zbuf); px += nstep; pz += nstep; } for (i = 0; i < n3; i++) { xval = *px++; if (xval > 127) xval = 127; *pz++ = xval; } } return (MLIB_SUCCESS); }
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const { __m128i** ptrSSE = (__m128i**) ptr; // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy) int cnt = 0; for( int n = FREAK_NB_PAIRS/128; n-- ; ) { __m128i result128 = _mm_setzero_si128(); for( int m = 128/16; m--; cnt += 16 ) { __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i], pointsValue[descriptionPairs[cnt+1].i], pointsValue[descriptionPairs[cnt+2].i], pointsValue[descriptionPairs[cnt+3].i], pointsValue[descriptionPairs[cnt+4].i], pointsValue[descriptionPairs[cnt+5].i], pointsValue[descriptionPairs[cnt+6].i], pointsValue[descriptionPairs[cnt+7].i], pointsValue[descriptionPairs[cnt+8].i], pointsValue[descriptionPairs[cnt+9].i], pointsValue[descriptionPairs[cnt+10].i], pointsValue[descriptionPairs[cnt+11].i], pointsValue[descriptionPairs[cnt+12].i], pointsValue[descriptionPairs[cnt+13].i], pointsValue[descriptionPairs[cnt+14].i], pointsValue[descriptionPairs[cnt+15].i]); __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j], pointsValue[descriptionPairs[cnt+1].j], pointsValue[descriptionPairs[cnt+2].j], pointsValue[descriptionPairs[cnt+3].j], pointsValue[descriptionPairs[cnt+4].j], pointsValue[descriptionPairs[cnt+5].j], pointsValue[descriptionPairs[cnt+6].j], pointsValue[descriptionPairs[cnt+7].j], pointsValue[descriptionPairs[cnt+8].j], pointsValue[descriptionPairs[cnt+9].j], pointsValue[descriptionPairs[cnt+10].j], pointsValue[descriptionPairs[cnt+11].j], pointsValue[descriptionPairs[cnt+12].j], pointsValue[descriptionPairs[cnt+13].j], pointsValue[descriptionPairs[cnt+14].j], pointsValue[descriptionPairs[cnt+15].j]); __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers workReg = _mm_cmpeq_epi8(workReg, operand2); // emulated "not less than" for 8-bit UNSIGNED integers workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full result128 = _mm_or_si128(result128, workReg); } (**ptrSSE) = result128; ++(*ptrSSE); } (*ptrSSE) -= 8; }
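The min/cmpeq pair above is the usual SSE2 idiom for an unsigned byte comparison, which the instruction set does not provide directly: min(a, b) == b holds exactly when a >= b. A standalone sketch of the same trick (an illustration, not OpenCV code):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint8_t a[16] = { 0, 1, 2, 200, 255, 17, 90, 128, 3, 3, 3, 3, 3, 3, 3, 3 };
  const uint8_t b[16] = { 0, 2, 1, 255, 200, 17, 91, 127, 0, 1, 2, 3, 4, 5, 6, 7 };
  uint8_t ge[16];

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  /* 0xFF in every lane where a[i] >= b[i], 0x00 elsewhere. */
  __m128i vge = _mm_cmpeq_epi8(_mm_min_epu8(va, vb), vb);
  _mm_storeu_si128((__m128i *)ge, vge);

  for (int i = 0; i < 16; i++)
    printf("%c", ge[i] ? '1' : '0');   /* prints: 1010110111110000 */
  printf("\n");
  return 0;
}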
SIMDValue SIMDUint8x16Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_min_epu8(tmpaValue.m128i_value, tmpbValue.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
__m64 _m_pminub(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_min_epu8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
static FORCE_INLINE __m128i mm_min_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_min_epu8(a, b);
    else {
        __m128i word_32768 = _mm_set1_epi16(32768);

        __m128i a_minus = _mm_sub_epi16(a, word_32768);
        __m128i b_minus = _mm_sub_epi16(b, word_32768);

        return _mm_add_epi16(_mm_min_epi16(a_minus, b_minus), word_32768);
    }
}
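The 16-bit branch above works around the fact that SSE2 only has a signed 16-bit minimum: subtracting 32768 maps the unsigned range 0..65535 onto the signed range -32768..32767 while preserving order, so _mm_min_epi16 then picks the correct lane. A self-contained sketch of the same trick (min_epu16_sse2 is a hypothetical name; SSE4.1 code would simply call _mm_min_epu16):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i min_epu16_sse2(__m128i a, __m128i b)
{
  const __m128i bias = _mm_set1_epi16((short)0x8000);   /* 32768 */
  __m128i a_biased = _mm_sub_epi16(a, bias);
  __m128i b_biased = _mm_sub_epi16(b, bias);
  return _mm_add_epi16(_mm_min_epi16(a_biased, b_biased), bias);
}

int main(void)
{
  const uint16_t a[8] = { 0, 1, 40000, 65535, 32768, 32767, 1000, 60000 };
  const uint16_t b[8] = { 5, 0, 39999,     7, 32767, 32768, 2000, 60001 };
  uint16_t r[8];

  _mm_storeu_si128((__m128i *)r,
                   min_epu16_sse2(_mm_loadu_si128((const __m128i *)a),
                                  _mm_loadu_si128((const __m128i *)b)));
  for (int i = 0; i < 8; i++)
    printf("%u ", r[i]);   /* prints: 0 0 39999 7 32767 32767 1000 60000 */
  printf("\n");
  return 0;
}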
void exponent_min_sse2(uint8_t *expTarget, uint8_t *exp, uint8_t *exp1, int n)
{
    int i;

    for (i = 0; i < (n & ~15); i += 16) {
        __m128i vexp  = _mm_loadu_si128((__m128i*)&exp[i]);
        __m128i vexp1 = _mm_loadu_si128((__m128i*)&exp1[i]);
        vexp = _mm_min_epu8(vexp, vexp1);
        _mm_storeu_si128((__m128i*)&expTarget[i], vexp);
    }
    for (; i < n; ++i)
        expTarget[i] = MIN(exp[i], exp1[i]);
}
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *orig = p0, *end = p2; line_copy8(p0, srcp + stride, width, 1); line_copy8(p1, srcp, width, 1); uint8_t threshold = (uint8_t)th; __m128i zero = _mm_setzero_si128(); __m128i xth = _mm_set1_epi8((int8_t)threshold); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 1 ? 1 : -1); line_copy8(p2, srcp, width, 1); uint8_t *coordinates[] = COORDINATES; for (int x = 0; x < width; x += 16) { __m128i sumlo = zero; __m128i sumhi = zero; for (int i = 0; i < 8; i++) { __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x)); sumlo = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero)); sumhi = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero)); } sumlo = _mm_srai_epi16(sumlo, 3); sumhi = _mm_srai_epi16(sumhi, 3); sumlo = _mm_packus_epi16(sumlo, sumhi); __m128i src = _mm_load_si128((__m128i *)(p1 + x)); __m128i limit = _mm_adds_epu8(src, xth); sumlo = _mm_max_epu8(sumlo, src); sumlo = _mm_min_epu8(sumlo, limit); _mm_store_si128((__m128i *)(dstp + x), sumlo); } dstp += stride; p0 = p1; p1 = p2; p2 = (p2 == end) ? orig : p2 + bstride; } }
// Denoise a 16x1 vector. static INLINE __m128i vp9_denoiser_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, const __m128i *k_16, const __m128i *l3, const __m128i *l32, const __m128i *l21, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); // Clamp absolute difference to 16 to be used to get mask. Doing this // allows us to use _mm_cmpgt_epi8, which operates on signed byte. const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); // Get masks for l2 l1 and l0 adjustments. const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); // Get adjustments for l2, l1, and l0. __m128i adj2 = _mm_and_si128(mask2, *l32); const __m128i adj1 = _mm_and_si128(mask1, *l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; // Combine the adjustments and get absolute adjustments. adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(*l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); // Restore the sign and get positive and negative adjustments. padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Adjustments <=7, and each element in acc_diff can fit in signed // char. acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); return acc_diff; }
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th, int *enable) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *orig = p0, *end = p2; uint8_t threshold = th > 255 ? 255 : (uint8_t)th; line_copy8(p0, srcp, width, 1); line_copy8(p1, srcp, width, 1); __m128i xth = _mm_set1_epi8((int8_t)threshold); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 1 ? 1 : -1); line_copy8(p2, srcp, width, 1); uint8_t *coordinates[] = {p0 - 1, p0, p0 + 1, p1 - 1, p1 + 1, p2 - 1, p2, p2 + 1}; for (int x = 0; x < width; x += 16) { __m128i src = _mm_load_si128((__m128i *)(p1 + x)); __m128i min = src; for (int i = 0; i < 8; i++) { if (enable[i]) { __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x)); min = _mm_min_epu8(target, min); } } __m128i limit = _mm_subs_epu8(src, xth); min = _mm_max_epu8(min, limit); _mm_store_si128((__m128i *)(dstp + x), min); } dstp += stride; p0 = p1; p1 = p2; p2 = (p2 == end) ? orig : p2 + bstride; } }
/// any (*p > 3) is set to be 3
COREARRAY_DLL_DEFAULT void vec_u8_geno_valid(C_UInt8 *p, size_t n)
{
#if defined(COREARRAY_SIMD_SSE2)

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p > 3) *p = 3;

	const __m128i zero = _mm_setzero_si128();
	const __m128i three = _mm_set1_epi8(3);
	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i*)p);
		__m128i mask = _mm_or_si128(_mm_cmplt_epi8(v, zero), _mm_cmplt_epi8(three, v));
		if (_mm_movemask_epi8(mask) > 0)
			_mm_store_si128((__m128i*)p, _mm_min_epu8(v, three));
	}

#endif

	for (; n > 0; n--, p++)
		if (*p > 3) *p = 3;
}
// count genotype sum and number of calls, not requiring 16-aligned p COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p, size_t n, C_Int32 &out_sum, C_Int32 &out_num) { C_Int32 sum=0, num=0; #if defined(COREARRAY_SIMD_AVX2) const __m256i three = _mm256_set1_epi8(3); const __m256i zero = _mm256_setzero_si256(); __m256i sum32 = zero, num32 = zero; size_t limit_by_U8 = 0; for (; n >= 32; ) { __m256i v = _mm256_loadu_si256((__m256i const*)p); p += 32; __m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three)); sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m)); num32 = _mm256_sub_epi8(num32, m); n -= 32; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 32)) { // add to sum sum32 = _mm256_sad_epu8(sum32, zero); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2))); sum32 = _mm256_add_epi32(sum32, _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1))); sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32)); // add to num num32 = _mm256_sad_epu8(num32, zero); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2))); num32 = _mm256_add_epi32(num32, _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1))); num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32)); // reset sum32 = num32 = zero; limit_by_U8 = 0; } } #elif defined(COREARRAY_SIMD_SSE2) // header, 16-byte aligned size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p++) if (*p <= 2) { sum += *p; num++; } const __m128i three = _mm_set1_epi8(3); const __m128i zero = _mm_setzero_si128(); __m128i sum16=zero, num16=zero; size_t limit_by_U8 = 0; for (; n >= 16; ) { __m128i v = _mm_load_si128((__m128i const*)p); p += 16; __m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three)); sum16 = _mm_add_epi8(sum16, v & m); num16 = _mm_sub_epi8(num16, m); n -= 16; limit_by_U8 ++; if ((limit_by_U8 >= 127) || (n < 16)) { // add to sum sum16 = _mm_sad_epu8(sum16, zero); sum += _mm_cvtsi128_si32(sum16); sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2)); // add to num num16 = _mm_sad_epu8(num16, zero); num += _mm_cvtsi128_si32(num16); num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2)); // reset sum16 = num16 = zero; limit_by_U8 = 0; } } #endif for (; n > 0; n--, p++) if (*p <= 2) { sum += *p; num++; } out_sum = sum; out_num = num; return p; }
template <> SIMD_INLINE __m128i OperationBinary8u<SimdOperationBinary8uMinimum>(const __m128i & a, const __m128i & b)
{
    return _mm_min_epu8(a, b);
}
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { unsigned char *running_avg_y_start = running_avg_y; unsigned char *sig_start = sig; unsigned int sum_diff_thresh; int r; int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ const __m128i l21 = _mm_set1_epi8(1); for (r = 0; r < 16; ++r) { /* Calculate differences */ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); /* Obtain the sign. FF if diff is negative. */ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); /* Clamp absolute difference to 16 to be used to get mask. Doing this * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); /* Get masks for l2 l1 and l0 adjustments */ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); /* Get adjustments for l2, l1, and l0 */ __m128i adj2 = _mm_and_si128(mask2, l32); const __m128i adj1 = _mm_and_si128(mask1, l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; /* Combine the adjustments and get absolute adjustments. */ adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); /* Restore the sign and get positive and negative adjustments. */ padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); /* Calculate filtered value. */ v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); /* Adjustments <=7, and each element in acc_diff can fit in signed * char. */ acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); /* Update pointers for next iteration. */ sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } { /* Compute the sum of all pixel differences of this MB. */ unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; if (abs_sum_diff > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. 
The adjustment is capped by a maximum delta, and // chosen such that in most cases the resulting sum_diff will be // within the acceptable range given by sum_diff_thresh. // The delta is set by the excess of absolute pixel diff over the // threshold. int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); sig -= sig_stride * 16; mc_running_avg_y -= mc_avg_y_stride * 16; running_avg_y -= avg_y_stride * 16; for (r = 0; r < 16; ++r) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); // Calculate differences. const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); // Clamp absolute difference to delta to get the adjustment. const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); // Restore the sign and get positive and negative adjustments. __m128i padj, nadj; padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Accumulate the adjustments. acc_diff = _mm_subs_epi8(acc_diff, padj); acc_diff = _mm_adds_epi8(acc_diff, nadj); // Update pointers for next iteration. sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } abs_sum_diff = abs_sum_diff_16x1(acc_diff); if (abs_sum_diff > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } }
vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); return FILTER_BLOCK; }
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor ) { __m128i t0, t1, t3, t4, t6, t7; // get bounding box // ---------------- // load the first row t0 = _mm_load_si128 ( (__m128i*) colorBlock ); t1 = _mm_load_si128 ( (__m128i*) colorBlock ); __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) ); // Minimum of Packed Unsigned Byte Integers t0 = _mm_min_epu8 ( t0, t16); // Maximum of Packed Unsigned Byte Integers t1 = _mm_max_epu8 ( t1, t16); __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) ); t0 = _mm_min_epu8 ( t0, t32); t1 = _mm_max_epu8 ( t1, t32); __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) ); t0 = _mm_min_epu8 ( t0, t48); t1 = _mm_max_epu8 ( t1, t48); // Shuffle Packed Doublewords t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t3); t1 = _mm_max_epu8 ( t1, t4); // Shuffle Packed Low Words t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t6); t1 = _mm_max_epu8 ( t1, t7); // inset the bounding box // ---------------------- // Unpack Low Data //__m128i t66 = _mm_set1_epi8( 0 ); __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 ); t0 = _mm_unpacklo_epi8(t0, t66); t1 = _mm_unpacklo_epi8(t1, t66); // copy (movdqa) //__m128i t2 = _mm_load_si128 ( &t1 ); __m128i t2 = t1; // Subtract Packed Integers t2 = _mm_sub_epi16(t2, t0); // Shift Packed Data Right Logical t2 = _mm_srli_epi16(t2, INSET_SHIFT); // Add Packed Integers t0 = _mm_add_epi16(t0, t2); t1 = _mm_sub_epi16(t1, t2); // Pack with Unsigned Saturation t0 = _mm_packus_epi16(t0, t0); t1 = _mm_packus_epi16(t1, t1); // store bounding box extents // -------------------------- _mm_store_si128 ( (__m128i*) minColor, t0 ); _mm_store_si128 ( (__m128i*) maxColor, t1 ); }
int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg, YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal, unsigned int motion_magnitude, int y_offset, int uv_offset) { unsigned char *sig = signal->thismb; int sig_stride = 16; unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset; int mc_avg_y_stride = mc_running_avg->y_stride; unsigned char *running_avg_y = running_avg->y_buffer + y_offset; int avg_y_stride = running_avg->y_stride; int r; (void)uv_offset; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ const __m128i l21 = _mm_set1_epi8(1); for (r = 0; r < 16; ++r) { /* Calculate differences */ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128( (__m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); /* Obtain the sign. FF if diff is negative. */ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); /* Clamp absolute difference to 16 to be used to get mask. Doing this * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ const __m128i clamped_absdiff = _mm_min_epu8( _mm_or_si128(pdiff, ndiff), k_16); /* Get masks for l2 l1 and l0 adjustments */ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); /* Get adjustments for l2, l1, and l0 */ __m128i adj2 = _mm_and_si128(mask2, l32); const __m128i adj1 = _mm_and_si128(mask1, l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; /* Combine the adjustments and get absolute adjustments. */ adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); /* Restore the sign and get positive and negative adjustments. */ padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); /* Calculate filtered value. */ v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); /* Adjustments <=7, and each element in acc_diff can fit in signed * char. */ acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); /* Update pointers for next iteration. */ sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } { /* Compute the sum of all pixel differences of this MB. */ union sum_union s; int sum_diff = 0; s.v = acc_diff; sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] + s.e[12] + s.e[13] + s.e[14] + s.e[15]; if (abs(sum_diff) > SUM_DIFF_THRESHOLD) { return COPY_BLOCK; } } vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride, signal->thismb, sig_stride); return FILTER_BLOCK; }
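This older revision reduces acc_diff with a union and sixteen scalar additions; the newer vp8_denoiser_filter_sse2 earlier in this collection calls an abs_sum_diff_16x1() helper that is not included in these snippets. One possible implementation of such a helper (an assumption, not necessarily the exact libvpx code) sign-extends the 16 bytes and folds them horizontally:

#include <emmintrin.h>

static unsigned int abs_sum_diff_16x1(__m128i acc_diff)
{
  /* Sign-extend the 16 signed bytes to 16-bit lanes. */
  const __m128i zero = _mm_setzero_si128();
  const __m128i one  = _mm_set1_epi16(1);
  const __m128i sign = _mm_cmpgt_epi8(zero, acc_diff);   /* FF where byte < 0 */
  const __m128i lo   = _mm_unpacklo_epi8(acc_diff, sign);
  const __m128i hi   = _mm_unpackhi_epi8(acc_diff, sign);

  /* Sum pairs into 32-bit lanes, then fold down to a single total. */
  __m128i sum = _mm_add_epi32(_mm_madd_epi16(lo, one), _mm_madd_epi16(hi, one));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  const int total = _mm_cvtsi128_si32(sum);
  return (unsigned int)(total < 0 ? -total : total);
}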
void SGMStereo::calcPixelwiseSAD(const unsigned char* leftSobelRow, const unsigned char* rightSobelRow) { calcHalfPixelRight(rightSobelRow); for (int x = 0; x < 16; ++x) { int leftCenterValue = leftSobelRow[x]; int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue; int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue; int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue); leftMinValue = std::min(leftMinValue, leftCenterValue); int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue); leftMaxValue = std::max(leftMaxValue, leftCenterValue); for (int d = 0; d <= x; ++d) { int rightCenterValue = rightSobelRow[width_ - 1 - x + d]; int rightMinValue = halfPixelRightMin_[width_ - 1 - x + d]; int rightMaxValue = halfPixelRightMax_[width_ - 1 - x + d]; int costLtoR = std::max(0, leftCenterValue - rightMaxValue); costLtoR = std::max(costLtoR, rightMinValue - leftCenterValue); int costRtoL = std::max(0, rightCenterValue - leftMaxValue); costRtoL = std::max(costRtoL, leftMinValue - rightCenterValue); int costValue = std::min(costLtoR, costRtoL); pixelwiseCostRow_[disparityTotal_*x + d] = costValue; } for (int d = x + 1; d < disparityTotal_; ++d) { pixelwiseCostRow_[disparityTotal_*x + d] = pixelwiseCostRow_[disparityTotal_*x + d - 1]; } } for (int x = 16; x < disparityTotal_; ++x) { int leftCenterValue = leftSobelRow[x]; int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue; int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue; int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue); leftMinValue = std::min(leftMinValue, leftCenterValue); int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue); leftMaxValue = std::max(leftMaxValue, leftCenterValue); __m128i registerLeftCenterValue = _mm_set1_epi8(static_cast<char>(leftCenterValue)); __m128i registerLeftMinValue = _mm_set1_epi8(static_cast<char>(leftMinValue)); __m128i registerLeftMaxValue = _mm_set1_epi8(static_cast<char>(leftMaxValue)); for (int d = 0; d < x/16; d += 16) { __m128i registerRightCenterValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rightSobelRow + width_ - 1 - x + d)); __m128i registerRightMinValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMin_ + width_ - 1 - x + d)); __m128i registerRightMaxValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMax_ + width_ - 1 - x + d)); __m128i registerCostLtoR = _mm_max_epu8(_mm_subs_epu8(registerLeftCenterValue, registerRightMaxValue), _mm_subs_epu8(registerRightMinValue, registerLeftCenterValue)); __m128i registerCostRtoL = _mm_max_epu8(_mm_subs_epu8(registerRightCenterValue, registerLeftMaxValue), _mm_subs_epu8(registerLeftMinValue, registerRightCenterValue)); __m128i registerCost = _mm_min_epu8(registerCostLtoR, registerCostRtoL); _mm_store_si128(reinterpret_cast<__m128i*>(pixelwiseCostRow_ + disparityTotal_*x + d), registerCost); } for (int d = x/16; d <= x; ++d) { int rightCenterValue = rightSobelRow[width_ - 1 - x + d]; int rightMinValue = halfPixelRightMin_[width_ - 1 - x + d]; int rightMaxValue = halfPixelRightMax_[width_ - 1 - x + d]; int costLtoR = std::max(0, leftCenterValue - rightMaxValue); costLtoR = std::max(costLtoR, rightMinValue - leftCenterValue); int costRtoL = std::max(0, rightCenterValue - leftMaxValue); costRtoL = std::max(costRtoL, leftMinValue - rightCenterValue); int costValue = std::min(costLtoR, costRtoL); 
pixelwiseCostRow_[disparityTotal_*x + d] = costValue; } for (int d = x + 1; d < disparityTotal_; ++d) { pixelwiseCostRow_[disparityTotal_*x + d] = pixelwiseCostRow_[disparityTotal_*x + d - 1]; } } for (int x = disparityTotal_; x < width_; ++x) { int leftCenterValue = leftSobelRow[x]; int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue; int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue; int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue); leftMinValue = std::min(leftMinValue, leftCenterValue); int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue); leftMaxValue = std::max(leftMaxValue, leftCenterValue); __m128i registerLeftCenterValue = _mm_set1_epi8(static_cast<char>(leftCenterValue)); __m128i registerLeftMinValue = _mm_set1_epi8(static_cast<char>(leftMinValue)); __m128i registerLeftMaxValue = _mm_set1_epi8(static_cast<char>(leftMaxValue)); for (int d = 0; d < disparityTotal_; d += 16) { __m128i registerRightCenterValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rightSobelRow + width_ - 1 - x + d)); __m128i registerRightMinValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMin_ + width_ - 1 - x + d)); __m128i registerRightMaxValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMax_ + width_ - 1 - x + d)); __m128i registerCostLtoR = _mm_max_epu8(_mm_subs_epu8(registerLeftCenterValue, registerRightMaxValue), _mm_subs_epu8(registerRightMinValue, registerLeftCenterValue)); __m128i registerCostRtoL = _mm_max_epu8(_mm_subs_epu8(registerRightCenterValue, registerLeftMaxValue), _mm_subs_epu8(registerLeftMinValue, registerRightCenterValue)); __m128i registerCost = _mm_min_epu8(registerCostLtoR, registerCostRtoL); _mm_store_si128(reinterpret_cast<__m128i*>(pixelwiseCostRow_ + disparityTotal_*x + d), registerCost); } } }
/** * Calculate output of given chromosome and inputs using SSE instructions * @param chr * @param inputs * @param outputs */ void cgp_get_output_sse(ga_chr_t chromosome, __m128i_aligned inputs[CGP_INPUTS], __m128i_aligned outputs[CGP_OUTPUTS]) { #ifdef SSE2 assert(CGP_OUTPUTS == 1); assert(CGP_ROWS == 4); assert(CGP_LBACK == 1); // previous and currently computed column register __m128i prev0, prev1, prev2, prev3; register __m128i current0, current1, current2, current3; // 0xFF constant static __m128i_aligned FF; FF = _mm_set1_epi8(0xFF); cgp_genome_t genome = (cgp_genome_t) chromosome->genome; /* if primary output is connected to primary input, skip evaluation This cannot happen - CGP does not generate circuits like that if (genome->outputs[0] < CGP_INPUTS) { int i = genome->outputs[0]; _mm_store_si128(&outputs[0], inputs[i]); return; } */ #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_INPUTS; i++) { unsigned char *_tmp = (unsigned char*) &inputs[i]; printf("I: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif int offset = -CGP_ROWS; for (int x = 0; x < CGP_COLS; x++) { for (int y = 0; y < CGP_ROWS; y++) { int idx = cgp_node_index(x, y); cgp_node_t *n = &(genome->nodes[idx]); // skip inactive blocks if (!n->is_active) continue; register __m128i A; register __m128i B; register __m128i Y; register __m128i TMP; register __m128i mask; LOAD_INPUT(A, n->inputs[0]); LOAD_INPUT(B, n->inputs[1]); switch (n->function) { case c255: Y = FF; break; case identity: Y = A; break; case inversion: Y = _mm_sub_epi8(FF, A); break; case b_or: Y = _mm_or_si128(A, B); break; case b_not1or2: // we don't have NOT instruction, we need to XOR with FF Y = _mm_xor_si128(FF, A); Y = _mm_or_si128(Y, B); break; case b_and: Y = _mm_and_si128(A, B); break; case b_nand: Y = _mm_and_si128(A, B); Y = _mm_xor_si128(FF, Y); break; case b_xor: Y = _mm_xor_si128(A, B); break; case rshift1: // no SR instruction for 8bit data, we need to shift // 16 bits and apply mask // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 1 2 3 4 5 6 7 | 8 A B C D E F G] // MSK: [ 0 1 2 3 4 5 6 7 | 0 A B C D E F G] mask = _mm_set1_epi8(0x7F); Y = _mm_srli_epi16(A, 1); Y = _mm_and_si128(Y, mask); break; case rshift2: // similar to rshift1 // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 0 1 2 3 4 5 6 | 7 8 A B C D E F] // MSK: [ 0 0 1 2 3 4 5 6 | 0 0 A B C D E F] mask = _mm_set1_epi8(0x3F); Y = _mm_srli_epi16(A, 2); Y = _mm_and_si128(Y, mask); break; case swap: // SWAP(A, B) (((A & 0x0F) << 4) | ((B & 0x0F))) // Shift A left by 4 bits // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHL: [ 5 6 7 8 A B C D | E F G H 0 0 0 0] // MSK: [ 5 6 7 8 0 0 0 0 | E F G H 0 0 0 0] mask = _mm_set1_epi8(0xF0); TMP = _mm_slli_epi16(A, 4); TMP = _mm_and_si128(TMP, mask); // Mask B // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // MSK: [ 0 0 0 0 5 6 7 8 | 0 0 0 0 E F G H] mask = _mm_set1_epi8(0x0F); Y = _mm_and_si128(B, mask); // Combine Y = _mm_or_si128(Y, TMP); break; case add: Y = _mm_add_epi8(A, B); break; case add_sat: Y = _mm_adds_epu8(A, B); break; case avg: // shift right first, then add, to avoid overflow mask = _mm_set1_epi8(0x7F); TMP = _mm_srli_epi16(A, 1); TMP = _mm_and_si128(TMP, mask); Y = _mm_srli_epi16(B, 1); Y = _mm_and_si128(Y, mask); Y = _mm_add_epi8(Y, TMP); break; case max: Y = _mm_max_epu8(A, B); break; case min: Y = _mm_min_epu8(A, B); break; } #ifdef TEST_EVAL_SSE2 __m128i _tmpval = Y; unsigned char *_tmp = (unsigned char*) &_tmpval; printf("N: %2d = " UCFMT16 "\n", idx + CGP_INPUTS, UCVAL16(0)); bool mismatch = false; for (int i = 1; i 
< 16; i++) { if (_tmp[i] != _tmp[0]) { fprintf(stderr, "Value mismatch on index %2d (%u instead of %u)\n", i, _tmp[i], _tmp[0]); mismatch = true; } } if (mismatch) { abort(); } #endif if (idx + CGP_INPUTS == genome->outputs[0]) { _mm_store_si128(&outputs[0], Y); #ifndef TEST_EVAL_SSE2 return; #endif } ASSIGN_CURRENT(y, Y); } // end of column offset += CGP_ROWS; prev0 = current0; prev1 = current1; prev2 = current2; prev3 = current3; } // end of row #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_OUTPUTS; i++) { unsigned char *_tmp = (unsigned char*) &outputs[i]; printf("O: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif #endif }
/// Element-wise minimum.
/// @ingroup SIMD
inline xmm_u8 min(const xmm_u8 &a, const xmm_u8 &b)
{
    return _mm_min_epu8(a, b);
}
void encode_exp_blk_ch_sse2(uint8_t *exp, int ncoefs, int exp_strategy) { int grpsize, ngrps, i, k, exp_min1, exp_min2; uint8_t v; ngrps = nexpgrptab[exp_strategy-1][ncoefs] * 3; grpsize = exp_strategy + (exp_strategy == EXP_D45); // for D15 strategy, there is no need to group/ungroup exponents switch (grpsize) { case 1: { // constraint for DC exponent exp[0] = MIN(exp[0], 15); // Decrease the delta between each groups to within 2 // so that they can be differentially encoded for (i = 1; i <= ngrps; i++) exp[i] = MIN(exp[i], exp[i-1]+2); for (i = ngrps-1; i >= 0; i--) exp[i] = MIN(exp[i], exp[i+1]+2); return; } // for each group, compute the minimum exponent case 2: { ALIGN16(uint16_t) exp1[256]; ALIGN16(const union __m128iui) vmask = {{0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff}}; i=0; k=1; for(; i < (ngrps & ~7); i += 8, k += 16) { __m128i v1 = _mm_loadu_si128((__m128i*)&exp[k]); __m128i v2 = _mm_srli_si128(v1, 1); v1 = _mm_and_si128(v1, vmask.v); v1 = _mm_min_epu8(v1, v2); _mm_store_si128((__m128i*)&exp1[i], v1); } switch (ngrps & 7) { case 7: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 6: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 5: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 4: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 3: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 2: exp1[i] = MIN(exp[k], exp[k+1]); ++i; k += 2; case 1: exp1[i] = MIN(exp[k], exp[k+1]); case 0: ; } // constraint for DC exponent exp[0] = MIN(exp[0], 15); // Decrease the delta between each groups to within 2 // so that they can be differentially encoded exp1[0] = MIN(exp1[0], (uint16_t)exp[0]+2); for (i = 1; i < ngrps; i++) exp1[i] = MIN(exp1[i], exp1[i-1]+2); for (i = ngrps-2; i >= 0; i--) exp1[i] = MIN(exp1[i], exp1[i+1]+2); // now we have the exponent values the decoder will see exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately i=0; k=1; for (; i < (ngrps & ~7); i += 8, k += 16) { __m128i v1 = _mm_load_si128((__m128i*)&exp1[i]); __m128i v2 = _mm_slli_si128(v1, 1); v1 = _mm_or_si128(v1, v2); _mm_storeu_si128((__m128i*)&exp[k], v1); } switch (ngrps & 7) { case 7: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 6: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 5: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 4: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 3: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 2: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; ++i; k += 2; case 1: v = (uint8_t)exp1[i]; exp[k] = v; exp[k+1] = v; case 0: ; } return; } default: { ALIGN16(uint32_t) exp1[256]; ALIGN16(const union __m128iui) vmask2 = {{0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff}}; i=0; k=1; for (; i < (ngrps & ~3); i += 4, k += 16) { __m128i v1 = _mm_loadu_si128((__m128i*)&exp[k]); __m128i v2 = _mm_srli_si128(v1, 1); v1 = _mm_min_epu8(v1, v2); v2 = _mm_srli_si128(v1, 2); v1 = _mm_min_epu8(v1, v2); v1 = _mm_and_si128(v1, vmask2.v); _mm_store_si128((__m128i*)&exp1[i], v1); } switch (ngrps & 3) { case 3: exp_min1 = MIN(exp[k ], exp[k+1]); exp_min2 = MIN(exp[k+2], exp[k+3]); exp1[i] = MIN(exp_min1, exp_min2); ++i; k += 4; case 2: exp_min1 = MIN(exp[k ], exp[k+1]); exp_min2 = MIN(exp[k+2], exp[k+3]); exp1[i] = MIN(exp_min1, exp_min2); ++i; k += 4; case 1: exp_min1 = MIN(exp[k ], exp[k+1]); exp_min2 = MIN(exp[k+2], exp[k+3]); exp1[i] = MIN(exp_min1, exp_min2); case 0: ; } // constraint for DC exponent exp[0] = MIN(exp[0], 15); // Decrease the delta between each groups to 
within 2 // so that they can be differentially encoded exp1[0] = MIN(exp1[0], (uint32_t)exp[0]+2); for (i = 1; i < ngrps; i++) exp1[i] = MIN(exp1[i], exp1[i-1]+2); for (i = ngrps-2; i >= 0; i--) exp1[i] = MIN(exp1[i], exp1[i+1]+2); // now we have the exponent values the decoder will see exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately i=0; k=1; for (; i < (ngrps & ~3); i += 4, k += 16) { __m128i v1 = _mm_load_si128((__m128i*)&exp1[i]); __m128i v2 = _mm_slli_si128(v1, 1); v1 = _mm_or_si128(v1, v2); v2 = _mm_slli_si128(v1, 2); v1 = _mm_or_si128(v1, v2); _mm_storeu_si128((__m128i*)&exp[k], v1); } switch (ngrps & 3) { case 3: v = exp1[i]; exp[k] = v; exp[k+1] = v; exp[k+2] = v; exp[k+3] = v; ++i; k += 4; case 2: v = exp1[i]; exp[k] = v; exp[k+1] = v; exp[k+2] = v; exp[k+3] = v; ++i; k += 4; case 1: v = exp1[i]; exp[k] = v; exp[k+1] = v; exp[k+2] = v; exp[k+3] = v; case 0: ; } return; } } }
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max) { uint8_t* p0 = buff + 16; uint8_t* p1 = p0 + bstride; uint8_t* p2 = p1 + bstride; uint8_t* p3 = p2 + bstride; uint8_t* p4 = p3 + bstride; uint8_t* orig = p0; uint8_t* end = p4; line_copy8(p0, srcp + 2 * stride, width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min; uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max; __m128i zero = _mm_setzero_si128(); __m128i ab = _mm_set1_epi16(15); __m128i max = _mm_set1_epi8((int8_t)th_max); __m128i min = _mm_set1_epi8((int8_t)th_min); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; uint8_t* posv[] = {p0, p1, p3, p4}; for (int x = 0; x < width; x += 16) { __m128i sumx[2] = {zero, zero}; __m128i sumy[2] = {zero, zero}; for (int i = 0; i < 4; i++) { __m128i xmm0, xmm1, xmul; xmul = _mm_load_si128((__m128i *)ar_mulx[i]); xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul)); sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul)); xmul = _mm_load_si128((__m128i *)ar_muly[i]); xmm0 = _mm_load_si128((__m128i *)(posv[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul)); sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul)); } for (int i = 0; i < 2; i++) { __m128i xmax, xmin, mull, mulh; sumx[i] = mm_abs_epi16(sumx[i]); sumy[i] = mm_abs_epi16(sumy[i]); xmax = _mm_max_epi16(sumx[i], sumy[i]); xmin = _mm_min_epi16(sumx[i], sumy[i]); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4); xmax = mm_cast_epi32(mull, mulh); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5); xmin = mm_cast_epi32(mull, mulh); sumx[i] = _mm_adds_epu16(xmax, xmin); sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift); } __m128i out = _mm_packus_epi16(sumx[0], sumx[1]); __m128i temp = _mm_min_epu8(out, max); temp = _mm_cmpeq_epi8(temp, max); out = _mm_or_si128(temp, out); temp = _mm_max_epu8(out, min); temp = _mm_cmpeq_epi8(temp, min); out = _mm_andnot_si128(temp, out); _mm_store_si128((__m128i*)(dstp + x), out); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }