__m64 _m_psubb(__m64 _MM1, __m64 _MM2) { __m128i lhs = {0}, rhs = {0}; lhs.m128i_i64[0] = _MM1.m64_i64; rhs.m128i_i64[0] = _MM2.m64_i64; lhs = _mm_sub_epi8(lhs, rhs); _MM1.m64_i64 = lhs.m128i_i64[0]; return _MM1; }
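// A minimal standalone check (not part of the snippet above; names are illustrative)
// of the property the MMX psubb emulation relies on: _mm_sub_epi8 performs sixteen
// independent byte subtractions with modulo-256 wraparound and no saturation.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t a[16], b[16], out[16];
    for (int i = 0; i < 16; ++i) { a[i] = (uint8_t)(i * 17); b[i] = (uint8_t)(200 - i); }

    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    _mm_storeu_si128((__m128i *)out, _mm_sub_epi8(va, vb));

    for (int i = 0; i < 16; ++i) {
        uint8_t expect = (uint8_t)(a[i] - b[i]);   // wraps modulo 256
        if (out[i] != expect) { printf("mismatch at byte %d\n", i); return 1; }
    }
    printf("byte-wise wraparound subtraction confirmed\n");
    return 0;
}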
static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height) { int mod32_width = (width / 32) * 32; auto pDst2 = pDst; auto pSrc2 = pSrc; auto v128 = _mm_set1_epi32(0x80808080); for ( int j = 0; j < height; ++j ) { for ( int i = 0; i < mod32_width; i+=32 ) { _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0); _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0); auto dst = simd_load_si128<mem_mode>(pDst+i); auto dst2 = simd_load_si128<mem_mode>(pDst+i+16); auto src = simd_load_si128<mem_mode>(pSrc+i); auto src2 = simd_load_si128<mem_mode>(pSrc+i+16); auto dstsub = _mm_sub_epi8(dst, v128); auto dstsub2 = _mm_sub_epi8(dst2, v128); auto srcsub = _mm_sub_epi8(src, v128); auto srcsub2 = _mm_sub_epi8(src2, v128); auto added = _mm_adds_epi8(dstsub, srcsub); auto added2 = _mm_adds_epi8(dstsub2, srcsub2); auto result = _mm_add_epi8(added, v128); auto result2 = _mm_add_epi8(added2, v128); simd_store_si128<mem_mode>(pDst+i, result); simd_store_si128<mem_mode>(pDst+i+16, result2); } pDst += dst_pitch; pSrc += src_pitch; } if (width > mod32_width) { adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height); } }
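// adddiff_c (the scalar tail handler) is not shown above. A hypothetical scalar
// equivalent of the vector math -- bias both bytes by 128, add with signed
// saturation, then un-bias -- reduces to clamp(dst + src - 128, 0, 255):
#include <stdint.h>

static inline uint8_t adddiff_scalar(uint8_t dst, uint8_t src) {
    int v = (int)dst + (int)src - 128;   // src holds a difference stored with a +128 bias
    if (v < 0) v = 0;
    if (v > 255) v = 255;                // matches the _mm_adds_epi8 saturation after un-biasing
    return (uint8_t)v;
}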
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) { float *symbolsPtr = (float*) symbols; __m128i *resultPtr = (__m128i*) llr; __m128 symbol1, symbol2, symbol3, symbol4; __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_12, symbol_34; __m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10)); __m128i result1n, result1a, result2n, result2a; __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16); __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); __m128i shuffle_abs_1 = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); __m128i shuffle_abs_2 = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); for (int i=0;i<nsymbols/8;i++) { symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v)); symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v)); symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v)); symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v)); symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2); symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4); symbol_i = _mm_packs_epi16(symbol_12, symbol_34); symbol_abs = _mm_abs_epi8(symbol_i); symbol_abs = _mm_sub_epi8(symbol_abs, offset); result1n = _mm_shuffle_epi8(symbol_i, shuffle_negated_1); result1a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1); result2n = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); result2a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); _mm_store_si128(resultPtr, _mm_or_si128(result1n, result1a)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(result2n, result2a)); resultPtr++; } // Demodulate last symbols for (int i=8*(nsymbols/8);i<nsymbols;i++) { short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i])); short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i])); llr[4*i+0] = -yre; llr[4*i+1] = -yim; llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10); llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10); } }
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { int i; const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle); const __m128i out = _mm_sub_epi8(in, in_0g0g); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); }
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g const __m128i out = _mm_sub_epi8(in, C); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); }
// Predictor0: ARGB_BLACK. static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; const __m128i black = _mm_set1_epi32(ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_sub_epi8(src, black); _mm_storeu_si128((__m128i*)&out[i], res); } if (i != num_pixels) { VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i); } }
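// ARGB_BLACK is 0xff000000 in libwebp, so the byte-wise subtraction above only
// changes the alpha byte (alpha - 0xff, modulo 256). A scalar sketch of one pixel,
// assuming that constant:
#include <stdint.h>

static inline uint32_t predictor_sub0_one_pixel(uint32_t in) {
    const uint32_t black = 0xff000000u;
    uint32_t out = 0;
    for (int b = 0; b < 32; b += 8) {
        const uint32_t x = (in >> b) & 0xff;
        const uint32_t y = (black >> b) & 0xff;
        out |= ((x - y) & 0xff) << b;    // per-byte wraparound, as _mm_sub_epi8 does
    }
    return out;
}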
// Denoise a 16x1 vector. static INLINE __m128i vp9_denoiser_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, const __m128i *k_16, const __m128i *l3, const __m128i *l32, const __m128i *l21, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); // Clamp absolute difference to 16 to be used to get mask. Doing this // allows us to use _mm_cmpgt_epi8, which operates on signed byte. const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); // Get masks for l2 l1 and l0 adjustments. const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); // Get adjustments for l2, l1, and l0. __m128i adj2 = _mm_and_si128(mask2, *l32); const __m128i adj1 = _mm_and_si128(mask1, *l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; // Combine the adjustments and get absolute adjustments. adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(*l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); // Restore the sign and get positive and negative adjustments. padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Adjustments <=7, and each element in acc_diff can fit in signed // char. acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); return acc_diff; }
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { const __m128i mask = _mm_set1_epi32(0x0000ff00); int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|... const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|... const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|... const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g); const __m128i out = _mm_sub_epi8(in, in_0g0g); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i); }
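// For reference, a scalar version of the same transform (what the VP8L *_C fallback
// is expected to compute): subtract the green channel from red and blue modulo 256,
// leaving alpha and green untouched. Sketch only; the name is illustrative.
#include <stdint.h>

static void subtract_green_scalar(uint32_t* argb_data, int num_pixels) {
    for (int i = 0; i < num_pixels; ++i) {
        const uint32_t argb  = argb_data[i];
        const uint32_t green = (argb >> 8) & 0xff;
        const uint32_t red   = (((argb >> 16) & 0xff) - green) & 0xff;
        const uint32_t blue  = ((argb & 0xff) - green) & 0xff;
        argb_data[i] = (argb & 0xff00ff00u) | (red << 16) | blue;
    }
}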
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { int x, y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); const __m128i m_inv = _mm_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. const __m128i data_l = _mm_unpacklo_epi8(a, b); const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi8(a, b); const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_l, pred_r); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); return (sad + 31) >> 6; }
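// A scalar sketch of the per-pixel blend the vector code implements, assuming
// AOM_BLEND_A64_ROUND_BITS == 6 (so mask_max == 64 and the rounding constant is 32),
// which is what _mm_maddubs_epi16 followed by xx_roundn_epu16 computes per pixel:
#include <stdint.h>
#include <stdlib.h>

static inline unsigned masked_sad_one_pixel(uint8_t src, uint8_t a, uint8_t b, uint8_t m) {
    const int pred = (m * a + (64 - m) * b + 32) >> 6;   // rounded 6-bit alpha blend
    return (unsigned)abs(pred - src);                    // accumulated by _mm_sad_epu8
}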
// Predictor5: avg2(avg2(L, TR), T) static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i avg, pred, res; Average2_m128i(&L, &TR, &avg); Average2_m128i(&avg, &T, &pred); res = _mm_sub_epi8(src, pred); _mm_storeu_si128((__m128i*)&out[i], res); } if (i != num_pixels) { VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i); } }
int exponent_sum_square_error_sse2(uint8_t *exp0, uint8_t *exp1, int ncoefs) { int i, err; int exp_error = 0; union { __m128i v; int32_t res[4]; } ures; __m128i vzero = _mm_setzero_si128(); __m128i vres = vzero; for (i = 0; i < (ncoefs & ~15); i+=16) { __m128i vexp = _mm_loadu_si128((__m128i*)&exp0[i]); __m128i vexp2 = _mm_loadu_si128((__m128i*)&exp1[i]); #if 0 //safer but needed? __m128i vexphi = _mm_unpackhi_epi8(vexp, vzero); __m128i vexp2hi = _mm_unpackhi_epi8(vexp2, vzero); __m128i vexplo = _mm_unpacklo_epi8(vexp, vzero); __m128i vexp2lo = _mm_unpacklo_epi8(vexp2, vzero); __m128i verrhi = _mm_sub_epi16(vexphi, vexp2hi); __m128i verrlo = _mm_sub_epi16(vexplo, vexp2lo); #else __m128i verr = _mm_sub_epi8(vexp, vexp2); __m128i vsign = _mm_cmplt_epi8(verr, vzero); __m128i verrhi = _mm_unpackhi_epi8(verr, vsign); __m128i verrlo = _mm_unpacklo_epi8(verr, vsign); #endif verrhi = _mm_madd_epi16(verrhi, verrhi); verrlo = _mm_madd_epi16(verrlo, verrlo); verrhi = _mm_add_epi32(verrhi, verrlo); vres = _mm_add_epi32(vres, verrhi); } _mm_store_si128(&ures.v, vres); ures.res[0]+=ures.res[1]; ures.res[2]+=ures.res[3]; exp_error += ures.res[0]+ures.res[2]; for (; i < ncoefs; ++i) { err = exp0[i] - exp1[i]; exp_error += (err * err); } return exp_error; }
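// Scalar reference for the vector loop above: the sum of squared differences between
// two exponent arrays. This simply generalizes the function's own tail loop.
#include <stdint.h>

static int exponent_sum_square_error_scalar(const uint8_t *exp0, const uint8_t *exp1, int ncoefs) {
    int exp_error = 0;
    for (int i = 0; i < ncoefs; ++i) {
        const int err = exp0[i] - exp1[i];
        exp_error += err * err;
    }
    return exp_error;
}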
// Applies filter on 4 pixels (p1, p0, q0 and q1) static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, __m128i* const q0, __m128i* const q1, const __m128i* const mask, int hev_thresh) { const __m128i sign_bit = _mm_set1_epi8(0x80); const __m128i k64 = _mm_set1_epi8(0x40); const __m128i zero = _mm_setzero_si128(); __m128i not_hev; __m128i t1, t2, t3; // compute hev mask GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); t1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 t1 = _mm_andnot_si128(not_hev, t1); // hev(p1 - q1) t2 = _mm_subs_epi8(*q0, *p0); // q0 - p0 t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about t2 = _mm_set1_epi8(3); t3 = _mm_set1_epi8(4); t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 3 t3 = _mm_adds_epi8(t1, t3); // 3 * (q0 - p0) + (p1 - q1) + 4 SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 *p0 = _mm_adds_epi8(*p0, t2); // p0 += t2 *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3 FLIP_SIGN_BIT2(*p0, *q0); // this is equivalent to signed (a + 1) >> 1 calculation t2 = _mm_add_epi8(t3, sign_bit); t3 = _mm_avg_epu8(t2, zero); t3 = _mm_sub_epi8(t3, k64); t3 = _mm_and_si128(not_hev, t3); // if !hev *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 FLIP_SIGN_BIT2(*p1, *q1); }
static INLINE unsigned int masked_sad8xh_ssse3( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); const __m128i m_inv = _mm_sub_epi8(mask_max, m); const __m128i data_l = _mm_unpacklo_epi8(a0, b0); const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpacklo_epi8(a1, b1); const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_l, pred_r); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); return (sad + 31) >> 6; }
void filterScanlinesSSE( unsigned char* filtered, unsigned char* image, unsigned int WIDTH, unsigned int HEIGHT ) { int blocks = 3*WIDTH/16; // Create move-mask for last block of each scanline __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ), _mm_set1_epi8( 3*WIDTH-16*blocks ) ); { const unsigned char* in = image; unsigned char* out = filtered; *out++ = 0; for(int b=0; b<blocks; b++ ) { _mm_storeu_si128( (__m128i*)out, _mm_lddqu_si128( (__m128i const*)in ) ); in += 16; out += 16; } _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out ); } for( unsigned int j=1; j<HEIGHT; j++ ) { const unsigned char* in = image + 3*WIDTH*(j-1); unsigned char* out = filtered + (3*WIDTH+1)*j; *out++ = 2; for(int b=0; b<blocks; b++ ) { __m128i _t0 = _mm_lddqu_si128( (__m128i const*)in ); __m128i _t1 = _mm_lddqu_si128( (__m128i const*)(in + 3*WIDTH ) ); _mm_storeu_si128( (__m128i*)out, _mm_sub_epi8( _t1, _t0 ) ); in += 16; out += 16; } _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out ); } }
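// A scalar sketch of the two scanline filters produced above, assuming PNG semantics:
// the first scanline is written with filter type 0 (None, raw bytes) and the remaining
// scanlines with filter type 2 (Up), i.e. current byte minus the byte above, modulo 256.
#include <stddef.h>
#include <stdint.h>

static void filter_up_scalar(uint8_t* out, const uint8_t* cur, const uint8_t* above, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        out[i] = (uint8_t)(cur[i] - above[i]);   // same wraparound as _mm_sub_epi8
    }
}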
static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { // Load two rows at a time, this seems to be a bit faster // than four rows at a time in this case. const __m128i src = _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(uint32_t *)src_ptr), _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); const __m128i a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); const __m128i b = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); const __m128i m = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); const __m128i m_inv = _mm_sub_epi8(mask_max, m); const __m128i data = _mm_unpacklo_epi8(a, b); const __m128i mask = _mm_unpacklo_epi8(m, m_inv); __m128i pred_16bit = _mm_maddubs_epi16(data, mask); pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } // At this point, the SAD is stored in lane 0 of 'res' int32_t sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; }
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i pa, pb; GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL| GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL| { const __m128i mask = _mm_cmpgt_epi32(pb, pa); const __m128i A = _mm_and_si128(mask, L); const __m128i B = _mm_andnot_si128(mask, T); const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T const __m128i res = _mm_sub_epi8(src, pred); _mm_storeu_si128((__m128i*)&out[i], res); } } if (i != num_pixels) { VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i); } }
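// A scalar sketch of one pixel of this predictor, mirroring the vector code above:
// pa = sum of per-channel |T - TL|, pb = sum of per-channel |L - TL|; the predictor is
// L when pb > pa, else T, and the residual is a byte-wise (mod-256) subtraction.
#include <stdint.h>
#include <stdlib.h>

static uint32_t predictor_sub11_one_pixel(uint32_t src, uint32_t L, uint32_t T, uint32_t TL) {
    int pa = 0, pb = 0;
    for (int b = 0; b < 32; b += 8) {
        pa += abs((int)((T >> b) & 0xff) - (int)((TL >> b) & 0xff));
        pb += abs((int)((L >> b) & 0xff) - (int)((TL >> b) & 0xff));
    }
    const uint32_t pred = (pb > pa) ? L : T;
    uint32_t res = 0;
    for (int b = 0; b < 32; b += 8) {
        res |= ((((src >> b) & 0xff) - ((pred >> b) & 0xff)) & 0xff) << b;
    }
    return res;
}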
static void GradientPredictDirect(const uint8_t* const row, const uint8_t* const top, uint8_t* const out, int length) { const int max_pos = length & ~7; int i; const __m128i zero = _mm_setzero_si128(); for (i = 0; i < max_pos; i += 8) { const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]); const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]); const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]); const __m128i A1 = _mm_unpacklo_epi8(A0, zero); const __m128i B1 = _mm_unpacklo_epi8(B0, zero); const __m128i C1 = _mm_unpacklo_epi8(C0, zero); const __m128i E = _mm_add_epi16(A1, B1); const __m128i F = _mm_sub_epi16(E, C1); const __m128i G = _mm_packus_epi16(F, zero); const __m128i H = _mm_sub_epi8(D, G); _mm_storel_epi64((__m128i*)(out + i), H); } for (; i < length; ++i) { out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]); } }
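// GradientPredictorC is not shown above; based on the vector math (16-bit add/sub
// followed by _mm_packus_epi16 saturation to [0, 255]), a matching scalar predictor
// would be:
#include <stdint.h>

static inline uint8_t gradient_predictor_scalar(uint8_t a, uint8_t b, uint8_t c) {
    const int g = a + b - c;
    return (g < 0) ? 0 : (g > 255) ? 255 : (uint8_t)g;   // mirrors the packus clamp
}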
template <> __m128i Invert<true>(__m128i value) { return _mm_sub_epi8(Sse2::K_INV_ZERO, value); }
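// Assuming Sse2::K_INV_ZERO is the all-0xFF constant, subtracting a byte from 0xFF
// never borrows, so the inversion above is equivalent to a bitwise NOT. A standalone
// sketch of the same idea:
#include <emmintrin.h>

static inline __m128i invert_u8(__m128i value) {
    const __m128i all_ones = _mm_set1_epi8((char)0xFF);
    return _mm_sub_epi8(all_ones, value);   // 255 - x == ~x for every byte
}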
void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression) { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; #if CV_SSE2 const int quarterPatternSize = patternSize/4; (void)quarterPatternSize; #endif int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); keypoints.clear(); threshold = std::min(std::max(threshold, 0), 255); #if CV_SSE2 __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K); (void)K16; (void)delta; (void)t; #endif uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3]; buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; int* cpbuf[3]; cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; cpbuf[1] = cpbuf[0] + img.cols + 1; cpbuf[2] = cpbuf[1] + img.cols + 1; memset(buf[0], 0, img.cols*3); for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr<uchar>(i) + 3; uchar* curr = buf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3]; memset(curr, 0, img.cols); int ncorners = 0; if( i < img.rows - 3 ) { j = 3; #if CV_SSE2 if( patternSize == 16 ) { for(; j < img.cols - 16 - 3; j += 16, ptr += 16) { __m128i m0, m1; __m128i v0 = _mm_loadu_si128((const __m128i*)ptr); __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta); v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta); __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta); __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta); __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta); __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta); m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0)); m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1)); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0))); m0 = _mm_or_si128(m0, m1); int mask = _mm_movemask_epi8(m0); if( mask == 0 ) continue; if( (mask & 255) == 0 ) { j -= 8; ptr -= 8; continue; } __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0; for( k = 0; k < N; k++ ) { __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta); m0 = _mm_cmpgt_epi8(x, v0); m1 = _mm_cmpgt_epi8(v1, x); c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0); c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1); max0 = _mm_max_epu8(max0, c0); max1 = _mm_max_epu8(max1, c1); } max0 = _mm_max_epu8(max0, max1); int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) if(m & 1) { cornerpos[ncorners++] = j+k; if(nonmax_suppression) curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold); } } } #endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; const uchar* tab = &threshold_tab[0] - v + 255; int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; if( d == 0 ) 
continue; d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; if( d & 1 ) { int vt = v - threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x < vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } if( d & 2 ) { int vt = v + threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } } } cornerpos[-1] = ncorners; if( i == 3 ) continue; const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3]; ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ ) { j = cornerpos[k]; int score = prev[j]; if( !nonmax_suppression || (score > prev[j+1] && score > prev[j-1] && score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) { keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score)); } } }
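// The run-length counting trick used in the SSE branch of FAST_t above: a comparison
// mask is all-ones (i.e. -1) in matching lanes, so c = (c - m) & m increments the
// per-lane run counter where the test holds and resets it to zero where it fails.
// Scalar sketch of one lane:
#include <stdint.h>

static inline uint8_t update_run_counter(uint8_t c, int test_holds) {
    const uint8_t m = test_holds ? 0xFF : 0x00;   // what _mm_cmpgt_epi8 produces
    return (uint8_t)((uint8_t)(c - m) & m);       // c + 1 where the test holds, else 0
}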
}bool validate_utf8_sse(const char *src, size_t len) { const char *end = src + len; while (src + 16 < end) { __m128i chunk = _mm_loadu_si128((const __m128i *)(src)); int asciiMask = _mm_movemask_epi8(chunk); if (!asciiMask) { src += 16; continue; } __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80)); __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed); __m128i state = _mm_set1_epi8((char)(0x0 | 0x80)); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2); __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3); __m128i mask3 = _mm_slli_si128(cond3, 1); __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed); // Fall back to the scalar processing if (_mm_movemask_epi8(cond4)) { break; } __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1)); __m128i shifts = count_sub1; shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1)); counts = _mm_add_epi8( counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2)); shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2)); if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4)); if (_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8)); __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8)); shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1 chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1), _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1)); __m128i chunk_right = _mm_slli_si128(chunk, 1); __m128i chunk_low = _mm_blendv_epi8( chunk, _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))), _mm_cmpeq_epi8(counts, _mm_set1_epi8(1))); __m128i chunk_high = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2))); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2), _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2)); chunk_high = _mm_srli_epi32(chunk_high, 2); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4), _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4)); chunk_high = _mm_or_si128( chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)), mask3)); int c = _mm_extract_epi16(counts, 7); int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 
15 : 14; __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8)); if (!_mm_testz_si128( mask3, _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)), _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8))))) return false; shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8), _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8)); chunk_high = _mm_slli_si128(chunk_high, 1); __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); chunk_low = _mm_shuffle_epi8(chunk_low, shuf); chunk_high = _mm_shuffle_epi8(chunk_high, shuf); __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high); __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high); if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) | _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) { return false; } src += source_advance; } return validate_utf8(src, end - src); }
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ static inline int16_t hMax(__m128i buffer) { __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); __m128i tmp3 = _mm_minpos_epu16(tmp1); return (int16_t)(_mm_cvtsi128_si32(tmp3)); }
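// The minpos trick: SSE4.1 only provides a horizontal *minimum* of unsigned 16-bit
// lanes (_mm_minpos_epu16), so a horizontal maximum is obtained by complementing
// against 0x7FFF first. A self-contained sketch of that idea (written with a 16-bit
// subtract and an explicit un-complement; not necessarily identical to the hMax above):
#include <smmintrin.h>
#include <stdint.h>

static inline int16_t hmax_epi16(__m128i v) {
    // 0x7FFF - x is order-reversing for signed 16-bit lanes and always fits in an
    // unsigned 16-bit value, so minpos picks the lane holding the maximum of x.
    const __m128i flipped = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), v);
    const __m128i minpos  = _mm_minpos_epu16(flipped);            // min in bits 0..15
    return (int16_t)(0x7FFF - _mm_extract_epi16(minpos, 0));
}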
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) { float *symbolsPtr = (float*) symbols; __m128i *resultPtr = (__m128i*) llr; __m128 symbol1, symbol2, symbol3, symbol4; __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34; __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42)); __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42)); __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64); __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33; __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0); __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff); __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff); __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff); __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff); __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10); __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff); __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4); __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff); for (int i=0;i<nsymbols/8;i++) { symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v)); symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v)); symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v)); symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v)); symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2); symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4); symbol_i = _mm_packs_epi16(symbol_12, symbol_34); symbol_abs = _mm_abs_epi8(symbol_i); symbol_abs = _mm_sub_epi8(symbol_abs, offset1); symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2); result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1); result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1); result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1); result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2); result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3); result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3); result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3); _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++; } for (int i=8*(nsymbols/8);i<nsymbols;i++) { float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i])); float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i])); llr[6*i+0] = -yre; llr[6*i+1] = -yim; llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42); llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42); llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42); llr[6*i+5] = 
abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42); } }
void Sobel::sobelSSE(const Image1D& srcImage, SobelImage& destImage) { ASSERT(srcImage.width % 16 == 0); ASSERT(srcImage.height >= 3); ASSERT(srcImage.yStart >= 0); ASSERT(srcImage.yStart <= srcImage.height); destImage.setResolution(srcImage.width, srcImage.height); destImage.yStart = srcImage.yStart; if(srcImage.yStart >= srcImage.height) return; // a b c 0 1 2 // d e f 3 4 5 // g h i 6 7 8 __m128i valA, valB, valC, valD, valF, valG, valH, valI; __m128i sumX; __m128i sumY; __m128i tmp; __m128i zeros = _mm_setzero_si128(); __m128i* pDestImg; __m128i* pDestImgLineEnd; // Fill top line for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.yStart]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.yStart + 1]); pDestImg < pDestImgLineEnd; ++pDestImg) { *pDestImg = zeros; } int lastRow = destImage.height - 1; const Image1D::Pixel* p0 = srcImage[destImage.yStart]; const Image1D::Pixel* p1 = srcImage[destImage.yStart + 1]; const Image1D::Pixel* p2 = srcImage[destImage.yStart + 2]; const Image1D::Pixel* p0LineEnd; for(int y = destImage.yStart + 1; y < lastRow; ++y) { for(p0LineEnd = srcImage[y], pDestImg = reinterpret_cast<__m128i*>(destImage[y]); p0 < p0LineEnd; p0 += 16, p1 += 16, p2 += 16, pDestImg += 2) { // laod values valA = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 - 1)); valB = _mm_load_si128(reinterpret_cast<const __m128i*>(p0)); valC = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 + 1)); valD = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 - 1)); valF = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + 1)); valG = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 - 1)); valH = _mm_load_si128(reinterpret_cast<const __m128i*>(p2)); valI = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + 1)); sumX = _mm_avg_epu8(valA, valG); // sumX = (a + g) / 2 sumX = _mm_avg_epu8(sumX, valD); // sumX = (sumX + d) / 2 sumX = _mm_avg_epu8(sumX, zeros); // sumX = sumX / 2 with average, because there is no 8 bit shift tmp = _mm_avg_epu8(valC, valI); // tmp = (c + i) / 2 tmp = _mm_avg_epu8(tmp, valF); // tnp = (tmp + f) / 2 tmp = _mm_avg_epu8(tmp, zeros); // tnp = tnp / 2 with average, because there is no 8 bit shift sumX = _mm_sub_epi8(sumX, tmp); sumY = _mm_avg_epu8(valA, valC); // sumX = (a + c) / 2 sumY = _mm_avg_epu8(sumY, valB); // sumX = (sumX + b) / 2 sumY = _mm_avg_epu8(sumY, zeros); // sumX = sumX / 2 with average, because there is no 8 bit shift tmp = _mm_avg_epu8(valG, valI); // tmp = (g + i) / 2 tmp = _mm_avg_epu8(tmp, valH); // tnp = (tmp + h) / 2 tmp = _mm_avg_epu8(tmp, zeros); // tnp = tnp / 2 with average, because there is no 8 bit shift sumY = _mm_sub_epi8(sumY, tmp); *pDestImg = _mm_unpacklo_epi8(sumX, sumY); *(pDestImg + 1) = _mm_unpackhi_epi8(sumX, sumY); } } // Fill bottom line for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.height - 1]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.height]); pDestImg < pDestImgLineEnd; ++pDestImg) { *pDestImg = zeros; } // Fill right and left border for(int y = destImage.yStart; y < destImage.height - 1; y++) { destImage[y]->index = 0; (destImage[y + 1] - 1)->index = 0; } }
mlib_status __mlib_VectorSumAbsDiff_S8_Sat( mlib_d64 *z, const mlib_s8 *x, const mlib_s8 *y, mlib_s32 n) { if (n <= 0) return (MLIB_FAILURE); mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0; mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y; __m128i zero, xbuf, ybuf, zbuf, mext, mbuf; zero = _mm_setzero_si128(); zbuf = zero; nstep = 16 / sizeof (mlib_s8); ax = (mlib_addr)x & 15; ay = (mlib_addr)y & 15; n1 = ((16 - ax) & 15) / sizeof (mlib_s8); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } *z = sum; } else { for (i = 0; i < n1; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } if (ax == ay) { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } else { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } for (i = 0; i < n3; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } mlib_d64 dsum = sum; long long pz[2]; _mm_storeu_si128((__m128i *)pz, zbuf); dsum += pz[0]; dsum += pz[1]; *z = dsum; } return (MLIB_SUCCESS); }
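// Scalar view of the conditional-negate trick used above: x - y is computed modulo
// 256, then XOR-ed with and subtracted by the compare mask (all-ones where y > x),
// which is a branchless two's-complement negation. The result is |x - y| as an
// unsigned byte, ready to be summed by _mm_sad_epu8. Sketch:
#include <stdint.h>

static inline uint8_t absdiff_s8(int8_t x, int8_t y) {
    const uint8_t d = (uint8_t)((uint8_t)x - (uint8_t)y);   // x - y, modulo 256
    const uint8_t m = (y > x) ? 0xFF : 0x00;                // mask from _mm_cmpgt_epi8(y, x)
    return (uint8_t)((uint8_t)(d ^ m) - m);                 // negate when y > x  ->  |x - y|
}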
EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_16bit_SSE2_INTRIN( EB_U16 *inputSamplePtr, // input parameter, source Picture Ptr EB_U32 inputStride, // input parameter, source stride EB_U16 *reconSamplePtr, // input parameter, deblocked Picture Ptr EB_U32 reconStride, // input parameter, deblocked stride EB_U32 lcuWidth, // input parameter, LCU width EB_U32 lcuHeight, // input parameter, LCU height EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1], // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES] EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES] // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES] { #define boShift 5 EB_ERRORTYPE return_error = EB_ErrorNone; EB_U64 count_x, count_y; EB_S32 diff; __m128i xmm0, xmm_1, xmm_N1, xmm_N3, xmm_N4, xmm_skip_mask, xmm9, xmm10, xmm11, xmm12, xmm13, xmm15; __m128i xmm_temp_input1, xmm_temp_input2, xmm_temp_recon1, xmm_temp_recon2, xmm_diff1, xmm_diff2; __m128i xmm_sign_1, xmm_sign_1a, xmm_sign_1b, xmm_sign_2a, xmm_sign_2b, xmm_sign_2, xmm_eoIndex; xmm0 = _mm_setzero_si128(); xmm12 = _mm_setzero_si128(); xmm15 = _mm_set1_epi16(0x0001); xmm_N1 = _mm_set1_epi8((signed char)0xFF); xmm_N3 = _mm_set1_epi8((signed char)0xFD); xmm_N4 = _mm_set1_epi8((signed char)0xFC); xmm_1 = _mm_sub_epi8(xmm0, xmm_N1); // Initialize SAO Arrays EB_ALIGN(16) EB_S8 rTemp[512] = { 0 }; EB_U64 reconStrideTemp; lcuHeight -= 2; inputSamplePtr += inputStride + 1; reconSamplePtr++; if (lcuWidth == 16) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 2; } else if (lcuWidth == 28) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 6); for (count_y = 0; count_y < lcuHeight; ++count_y) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90 
MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) //----------- 16-25 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 16)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 24)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 16)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 24)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 12); //skip last 6 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 12); //skip last 6 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr+16, reconSamplePtr+2*reconStride+16) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr+15, reconSamplePtr+2*reconStride+17) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+17, reconSamplePtr+2*reconStride+15) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 6; } else if (lcuWidth == 56) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 10); lcuWidth -= 8; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-53 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff1 = _mm_slli_si128(xmm_diff1, 4); //skip last 10 samples xmm_diff1 = _mm_srli_si128(xmm_diff1, 4); //skip last 10 samples // EO-90 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = 
_mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 10; } else { lcuWidth -= 16; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); //EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) //EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) //EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-61 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 2; } lcuWidth = (EB_U16)lcuWidth * (EB_U16)lcuHeight; MACRO_SAVE_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1, 1) MACRO_SAVE_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2, 2) MACRO_SAVE_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3, 3) return return_error; }
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val, int8_t missing, int8_t missing_substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } // body, SSE2 const __m128i val16 = _mm_set1_epi8(val); const __m128i miss16 = _mm_set1_epi8(missing); const __m128i sub16 = _mm_set1_epi8(missing_substitute); const __m128i mask = _mm_set1_epi16(0x00FF); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)out & 0x10)) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); n -= 16; out += 16; } const __m256i val32 = _mm256_set1_epi8(val); const __m256i miss32 = _mm256_set1_epi8(missing); const __m256i sub32 = _mm256_set1_epi8(missing_substitute); const __m256i mask2 = _mm256_set1_epi16(0x00FF); for (; n >= 32; n-=32) { __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2)); __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8)); __m256i c = _mm256_setzero_si256(); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32)); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32)); w1 = _mm256_cmpeq_epi8(v1, miss32); w2 = _mm256_cmpeq_epi8(v2, miss32); __m256i w = _mm256_or_si256(w1, w2); c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c)); c = _mm256_permute4x64_epi64(c, 0xD8); _mm256_store_si256((__m256i *)out, c); out += 32; } # endif // SSE2 only for (; n >= 16; n-=16) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); out += 16; } #endif // tail for (; n > 0; n--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } }
// this function performs precise calculations void PreOver_SSE2(void* dest, const void* source1, const void* source2, size_t size) { static const size_t stride = sizeof(__m128i)*4; static const u32 PSD = 64; static const __m128i round = _mm_set1_epi16(128); static const __m128i lomask = _mm_set1_epi32(0x00FF00FF); assert(source1 != NULL && source2 != NULL && dest != NULL); assert(size % stride == 0); const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1); const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2); __m128i* dest128 = reinterpret_cast<__m128i*>(dest); __m128i d, s, a, rb, ag, t; // TODO: dynamic prefetch schedluing distance? needs to be optimized (R.N) for(size_t k = 0, length = size/stride; k < length; ++k) { // TODO: put prefetch between calculations?(R.N) _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA); _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA); // work on entire cacheline before next prefetch for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) { // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/ // TODO: load entire cacheline at the same time? are there enough registers? 32 bit mode (special compile for 64bit?) (R.N) s = _mm_load_si128(source128_1); // AABGGRR d = _mm_load_si128(source128_2); // AABGGRR // PRELERP(S, D) = S+D - ((S*D[A]+0x80)>>8)+(S*D[A]+0x80))>>8 // T = S*D[A]+0x80 => PRELERP(S,D) = S+D - ((T>>8)+T)>>8 // set alpha to lo16 from dest_ a = _mm_srli_epi32(d, 24); // 000000AA rb = _mm_slli_epi32(a, 16); // 00AA0000 a = _mm_or_si128(rb, a); // 00AA00AA rb = _mm_and_si128(lomask, s); // 00BB00RR rb = _mm_mullo_epi16(rb, a); // BBBBRRRR rb = _mm_add_epi16(rb, round); // BBBBRRRR t = _mm_srli_epi16(rb, 8); // 00BB00RR t = _mm_add_epi16(t, rb); rb = _mm_srli_epi16(t, 8); ag = _mm_srli_epi16(s, 8); // 00AA00GG ag = _mm_mullo_epi16(ag, a); // AAAAGGGG ag = _mm_add_epi16(ag, round); t = _mm_srli_epi16(ag, 8); t = _mm_add_epi16(t, ag); ag = _mm_andnot_si128(lomask, t); // AA00GG00 rb = _mm_or_si128(rb, ag); // AABGGRR pack rb = _mm_sub_epi8(s, rb); // sub S-[(D[A]*S)/255] d = _mm_add_epi8(d, rb); // add D+[S-(D[A]*S)/255] _mm_store_si128(dest128, d); } } }
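// Scalar per-channel reference for the premultiplied "over" above: with t = s*a + 128,
// the (t + (t >> 8)) >> 8 pattern is the usual exact division by 255, so each destination
// channel becomes d + (s - s*dest_alpha/255), with the same byte wraparound as the
// _mm_sub_epi8 / _mm_add_epi8 pair. Sketch:
#include <stdint.h>

static inline uint8_t preover_channel(uint8_t s, uint8_t d, uint8_t dest_alpha) {
    const uint32_t t  = (uint32_t)s * dest_alpha + 128;
    const uint8_t  sa = (uint8_t)((t + (t >> 8)) >> 8);   // (s * dest_alpha) / 255, rounded
    return (uint8_t)(d + (uint8_t)(s - sa));              // D + (S - S*D[A]/255)
}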
__m128i test (__m128i s1, __m128i s2) { return _mm_sub_epi8 (s1, s2); }
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest) { if(dest.empty())dest.create(src1.size(),CV_8U); const int imsize = (src1.size().area()/16); uchar* s1 = src1.data; uchar* s2 = src2.data; uchar* a = alpha.data; uchar* d = dest.data; const __m128i zero = _mm_setzero_si128(); const __m128i amax = _mm_set1_epi8(char(255)); int i=0; if(s1==d) { for(;i<imsize;++i) { __m128i ms1h = _mm_load_si128((__m128i*)(s1)); __m128i ms2h = _mm_load_si128((__m128i*)(s2)); __m128i mah = _mm_load_si128((__m128i*)(a)); __m128i imah = _mm_sub_epi8(amax,mah); __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero); ms1h = _mm_unpackhi_epi8(ms1h, zero); __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero); ms2h = _mm_unpackhi_epi8(ms2h, zero); __m128i mal = _mm_unpacklo_epi8(mah, zero); mah = _mm_unpackhi_epi8(mah, zero); __m128i imal = _mm_unpacklo_epi8(imah, zero); imah = _mm_unpackhi_epi8(imah, zero); ms1l = _mm_mullo_epi16(ms1l,mal); ms2l = _mm_mullo_epi16(ms2l,imal); ms1l = _mm_add_epi16(ms1l,ms2l); //ms1l = _mm_srli_epi16(ms1l,8); ms1l = _mm_srai_epi16(ms1l,8); ms1h = _mm_mullo_epi16(ms1h,mah); ms2h = _mm_mullo_epi16(ms2h,imah); ms1h = _mm_add_epi16(ms1h,ms2h); //ms1h = _mm_srli_epi16(ms1h,8); ms1h = _mm_srai_epi16(ms1h,8); _mm_stream_si128((__m128i*)s1,_mm_packs_epi16(ms1l,ms1h)); s1+=16; s2+=16; a+=16; } } else { for(;i<imsize;++i) { __m128i ms1h = _mm_load_si128((__m128i*)(s1)); __m128i ms2h = _mm_load_si128((__m128i*)(s2)); __m128i mah = _mm_load_si128((__m128i*)(a)); __m128i imah = _mm_sub_epi8(amax,mah); __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero); ms1h = _mm_unpackhi_epi8(ms1h, zero); __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero); ms2h = _mm_unpackhi_epi8(ms2h, zero); __m128i mal = _mm_unpacklo_epi8(mah, zero); mah = _mm_unpackhi_epi8(mah, zero); __m128i imal = _mm_unpacklo_epi8(imah, zero); imah = _mm_unpackhi_epi8(imah, zero); ms1l = _mm_mullo_epi16(ms1l,mal); ms2l = _mm_mullo_epi16(ms2l,imal); ms1l = _mm_add_epi16(ms1l,ms2l); //ms1l = _mm_srli_epi16(ms1l,8); ms1l = _mm_srai_epi16(ms1l,8); ms1h = _mm_mullo_epi16(ms1h,mah); ms2h = _mm_mullo_epi16(ms2h,imah); ms1h = _mm_add_epi16(ms1h,ms2h); //ms1h = _mm_srli_epi16(ms1h,8); ms1h = _mm_srai_epi16(ms1h,8); _mm_store_si128((__m128i*)d,_mm_packs_epi16(ms1l,ms1h)); s1+=16; s2+=16; a+=16; d+=16; } } { uchar* s1 = src1.data; uchar* s2 = src2.data; uchar* a = alpha.data; uchar* d = dest.data; for(int n=i*16;n<src1.size().area();n++) { d[n] = (a[n]*s1[n] + (255-a[n])*s2[n])>>8; } } }
/** * Calculate output of given chromosome and inputs using SSE instructions * @param chr * @param inputs * @param outputs */ void cgp_get_output_sse(ga_chr_t chromosome, __m128i_aligned inputs[CGP_INPUTS], __m128i_aligned outputs[CGP_OUTPUTS]) { #ifdef SSE2 assert(CGP_OUTPUTS == 1); assert(CGP_ROWS == 4); assert(CGP_LBACK == 1); // previous and currently computed column register __m128i prev0, prev1, prev2, prev3; register __m128i current0, current1, current2, current3; // 0xFF constant static __m128i_aligned FF; FF = _mm_set1_epi8(0xFF); cgp_genome_t genome = (cgp_genome_t) chromosome->genome; /* if primary output is connected to primary input, skip evaluation This cannot happen - CGP does not generate circuits like that if (genome->outputs[0] < CGP_INPUTS) { int i = genome->outputs[0]; _mm_store_si128(&outputs[0], inputs[i]); return; } */ #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_INPUTS; i++) { unsigned char *_tmp = (unsigned char*) &inputs[i]; printf("I: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif int offset = -CGP_ROWS; for (int x = 0; x < CGP_COLS; x++) { for (int y = 0; y < CGP_ROWS; y++) { int idx = cgp_node_index(x, y); cgp_node_t *n = &(genome->nodes[idx]); // skip inactive blocks if (!n->is_active) continue; register __m128i A; register __m128i B; register __m128i Y; register __m128i TMP; register __m128i mask; LOAD_INPUT(A, n->inputs[0]); LOAD_INPUT(B, n->inputs[1]); switch (n->function) { case c255: Y = FF; break; case identity: Y = A; break; case inversion: Y = _mm_sub_epi8(FF, A); break; case b_or: Y = _mm_or_si128(A, B); break; case b_not1or2: // we don't have NOT instruction, we need to XOR with FF Y = _mm_xor_si128(FF, A); Y = _mm_or_si128(Y, B); break; case b_and: Y = _mm_and_si128(A, B); break; case b_nand: Y = _mm_and_si128(A, B); Y = _mm_xor_si128(FF, Y); break; case b_xor: Y = _mm_xor_si128(A, B); break; case rshift1: // no SR instruction for 8bit data, we need to shift // 16 bits and apply mask // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 1 2 3 4 5 6 7 | 8 A B C D E F G] // MSK: [ 0 1 2 3 4 5 6 7 | 0 A B C D E F G] mask = _mm_set1_epi8(0x7F); Y = _mm_srli_epi16(A, 1); Y = _mm_and_si128(Y, mask); break; case rshift2: // similar to rshift1 // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 0 1 2 3 4 5 6 | 7 8 A B C D E F] // MSK: [ 0 0 1 2 3 4 5 6 | 0 0 A B C D E F] mask = _mm_set1_epi8(0x3F); Y = _mm_srli_epi16(A, 2); Y = _mm_and_si128(Y, mask); break; case swap: // SWAP(A, B) (((A & 0x0F) << 4) | ((B & 0x0F))) // Shift A left by 4 bits // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHL: [ 5 6 7 8 A B C D | E F G H 0 0 0 0] // MSK: [ 5 6 7 8 0 0 0 0 | E F G H 0 0 0 0] mask = _mm_set1_epi8(0xF0); TMP = _mm_slli_epi16(A, 4); TMP = _mm_and_si128(TMP, mask); // Mask B // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // MSK: [ 0 0 0 0 5 6 7 8 | 0 0 0 0 E F G H] mask = _mm_set1_epi8(0x0F); Y = _mm_and_si128(B, mask); // Combine Y = _mm_or_si128(Y, TMP); break; case add: Y = _mm_add_epi8(A, B); break; case add_sat: Y = _mm_adds_epu8(A, B); break; case avg: // shift right first, then add, to avoid overflow mask = _mm_set1_epi8(0x7F); TMP = _mm_srli_epi16(A, 1); TMP = _mm_and_si128(TMP, mask); Y = _mm_srli_epi16(B, 1); Y = _mm_and_si128(Y, mask); Y = _mm_add_epi8(Y, TMP); break; case max: Y = _mm_max_epu8(A, B); break; case min: Y = _mm_min_epu8(A, B); break; } #ifdef TEST_EVAL_SSE2 __m128i _tmpval = Y; unsigned char *_tmp = (unsigned char*) &_tmpval; printf("N: %2d = " UCFMT16 "\n", idx + CGP_INPUTS, UCVAL16(0)); bool mismatch = false; for (int i = 1; i 
< 16; i++) { if (_tmp[i] != _tmp[0]) { fprintf(stderr, "Value mismatch on index %2d (%u instead of %u)\n", i, _tmp[i], _tmp[0]); mismatch = true; } } if (mismatch) { abort(); } #endif if (idx + CGP_INPUTS == genome->outputs[0]) { _mm_store_si128(&outputs[0], Y); #ifndef TEST_EVAL_SSE2 return; #endif } ASSIGN_CURRENT(y, Y); } // end of column offset += CGP_ROWS; prev0 = current0; prev1 = current1; prev2 = current2; prev3 = current3; } // end of row #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_OUTPUTS; i++) { unsigned char *_tmp = (unsigned char*) &outputs[i]; printf("O: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif #endif }