static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum = _mm_add_epi32 (sum,
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum = _mm_add_epi32 (sum,
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
{
    int x = 0;

    if( useSIMD )
    {
        __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i dx = _mm_set1_epi16(8);
        __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

        for( ; x <= len - 8; x += 8 )
        {
            __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
            __m128i sx = _mm_mullo_epi16(qx, qx);

            qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
            qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
            qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
            qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

            qx = _mm_add_epi16(qx, dx);
        }

        _mm_store_si128((__m128i*)buf, qx0);
        x0 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx1);
        x1 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx2);
        x2 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx3);
        x3 = buf[0] + buf[1] + buf[2] + buf[3];
    }
    return x;
}
SIMD_INLINE void InterpolateX1(const __m128i * alpha, __m128i * buffer)
{
    __m128i src = _mm_load_si128(buffer);
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(src, K_ZERO), _mm_load_si128(alpha + 0));
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(src, K_ZERO), _mm_load_si128(alpha + 1));
    _mm_store_si128(buffer, _mm_packs_epi32(lo, hi));
}
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                int stride) {
  const __m128i v_val_0_w =
      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
  const __m128i v_val_1_w =
      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
  const __m128i v_val_2_w =
      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
  const __m128i v_val_3_w =
      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));

  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

  const __m128i v_sum_d =
      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));

  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
//
// Conjugate-multiplies two complex16 vectors element-wise and returns the
// real and imaginary parts as two 32-bit integer vectors.
//
FORCE_INLINE
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
                                     struct complex16* x, int len1, struct complex16* y, int len2)
{
	const unum8 wlen = 4;  // sizeof(vcs) / sizeof(complex16);
	const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
	const __m128i xmm4 = _mm_set1_epi32(0x00010000);

	__m128i* Xs = (__m128i*) x;
	__m128i* Ys = (__m128i*) y;
	__m128i* Res = (__m128i*) re;
	__m128i* Ims = (__m128i*) im;

	for (int i = 0; i < len1 / wlen; i++){
		__m128i mx = _mm_loadu_si128(&Xs[i]);
		__m128i my = _mm_loadu_si128(&Ys[i]);

		// Negate im(y) (XOR with 0xFFFF0000 plus 0x00010000 completes the
		// two's complement of the high word), then swap re/im so that the
		// following madd yields x.im*y.re - x.re*y.im per element.
		__m128i ms2 = _mm_xor_si128(my, xmm5);
		ms2 = _mm_add_epi32(ms2, xmm4);
		ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

		_mm_storeu_si128(&Res[i], _mm_madd_epi16(my, mx));
		_mm_storeu_si128(&Ims[i], _mm_madd_epi16(ms2, mx));
	}

	// Scalar tail for the remaining elements.
	for (int i = (len1 / wlen) * wlen; i < len1; i++){
		re[i] = x[i].re * y[i].re + x[i].im * y[i].im;
		im[i] = x[i].im * y[i].re - x[i].re * y[i].im;
	}
	return 0;
}
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
                       _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS],
                                     const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();

  // Load data into four SSE registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data;   // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);   // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute scalar product between ((short*)x)[0..31] and localFitMatrix
    __m128i sum = _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(s16 *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms) {

  s16 rb;
  __m128i *dl_ch128;
  int avg;

  //clear average level
  avg128F = _mm_xor_si128(avg128F,avg128F);
  dl_ch128=(__m128i *)dl_ch;

  for (rb=0;rb<frame_parms->N_RB_DL;rb++) {

    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));

    dl_ch128+=3;
  }

  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);

  _mm_empty();
  _m_empty();

  return(avg);
}
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
SIMD_INLINE void InterpolateX2(const __m128i * alpha, __m128i * buffer)
{
    __m128i src = _mm_load_si128(buffer);
    __m128i a = _mm_load_si128(alpha);
    __m128i u = _mm_madd_epi16(_mm_and_si128(src, K16_00FF), a);
    __m128i v = _mm_madd_epi16(_mm_and_si128(_mm_srli_si128(src, 1), K16_00FF), a);
    _mm_store_si128(buffer, _mm_or_si128(u, _mm_slli_si128(v, 2)));
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE41(const __m128i* const A,
                                    const __m128i* const B,
                                    __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
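A minimal, self-contained sketch (not taken from any of the libraries above) of the identity that the comment relies on: multiplying by a constant vector of 2s with _mm_madd_epi16 produces 2*(A+B) per adjacent pair, which a scalar loop can verify.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical test driver: checks the doubled pairwise horizontal add
// against a scalar reference for one vector of eight 16-bit values.
int main(void) {
  const int16_t in[8] = { 1, 2, 3, 4, -5, 6, 7, -8 };
  const __m128i v = _mm_loadu_si128((const __m128i*)in);
  const __m128i two = _mm_set1_epi16(2);
  const __m128i h = _mm_madd_epi16(v, two);  // 2*(in[0]+in[1]) | 2*(in[2]+in[3]) | ...

  int32_t out[4];
  _mm_storeu_si128((__m128i*)out, h);
  for (int i = 0; i < 4; ++i) {
    const int32_t ref = 2 * (in[2 * i] + in[2 * i + 1]);
    printf("%d %d %s\n", out[i], ref, out[i] == ref ? "ok" : "MISMATCH");
  }
  return 0;
}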
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{

  int16_t rb;
#if defined(__x86_64__) || defined(__i386__)
  __m128i *dl_ch128;
#elif defined(__arm__)
  int16x4_t *dl_ch128;
#endif
  int avg;

  //clear average level
#if defined(__x86_64__) || defined(__i386__)
  avg128F = _mm_setzero_si128();
  dl_ch128=(__m128i *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));
    dl_ch128+=3;
  }

#elif defined(__arm__)
  avg128F = vdupq_n_s32(0);
  dl_ch128=(int16x4_t *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5]));
    dl_ch128+=6;
  }
#endif

  DevAssert( frame_parms->N_RB_DL );
  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

  return(avg);
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
int vector_ps_short (const short* pa,const short* pb,size_t n)
{
	size_t k;
	size_t q = n / 16;
	size_t r = n % 16;
	int w;

	if (q > 0) {
		__m128i acc1 = _mm_setzero_si128();
		__m128i acc2 = _mm_setzero_si128();
		if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
			for (k=0;k<q;k++) {
				/* Load 16 words from each array */
				__m128i a1 = _mm_load_si128((__m128i*)pa);
				__m128i b1 = _mm_load_si128((__m128i*)pb);
				__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
				__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
				/* Multiply, sum and convert to double words */
				__m128i s1 = _mm_madd_epi16(a1,b1);
				__m128i s2 = _mm_madd_epi16(a2,b2);
				pa += 16;
				pb += 16;
				/* Accumulate */
				acc1 = _mm_add_epi32(acc1,s1);
				acc2 = _mm_add_epi32(acc2,s2);
			}
		}
		else {
			for (k=0;k<q;k++) {
				/* Load 16 words from each array */
				__m128i a1 = _mm_loadu_si128((__m128i*)pa);
				__m128i b1 = _mm_loadu_si128((__m128i*)pb);
				__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
				__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
				/* Multiply, sum and convert to double words */
				__m128i s1 = _mm_madd_epi16(a1,b1);
				__m128i s2 = _mm_madd_epi16(a2,b2);
				pa += 16;
				pb += 16;
				/* Accumulate */
				acc1 = _mm_add_epi32(acc1,s1);
				acc2 = _mm_add_epi32(acc2,s2);
			}
		}
		/* Final sum */
		acc1 = _mm_add_epi32(acc1,acc2);
		acc1 = _mm_hadd_epi32(acc1,acc1);
		acc1 = _mm_hadd_epi32(acc1,acc1);
		w = _mm_extract_epi32(acc1,0);
	}
	else {
		w = 0;
	}
	for (k=0;k<r;k++)
		w += (*pa++) * (*pb++);
	return w;
}
int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                    intptr_t block_size, int64_t *ssz,
                                    int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bit
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
//
// Conjugate-multiplies two complex16 vectors element-wise and returns the
// real and imaginary parts as two 32-bit integer vectors.
//
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
                                     struct complex16* x, int len1, struct complex16* y, int len2)
{
	const int wlen = 4;	// sizeof(vcs) / sizeof(complex16);
	const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);	//0x0000FFFF0000FFFF0000FFFF0000FFFF
	const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
	const __m128i xmm4 = _mm_set1_epi32(0x00010000);

	for (int i = 0; i < len1 / wlen; i++){
		/*
		vcs *vx = (vcs *)(x + wlen*i);
		vcs *vy = (vcs *)(y + wlen*i);
		vi *reout = (vi *)(re + wlen*i);
		vi *imout = (vi *)(im + wlen*i);

		vcs vs2 = conj0(*vy);
		vs2 = permutate_low<1, 0, 3, 2>(vs2);
		vs2 = permutate_high<1, 0, 3, 2>(vs2);

		*reout = (vcs)muladd(*vx, *vy);
		*imout = (vcs)muladd(*vx, vs2);
		*/
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		__m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i));

		//__m128i ms1 = _mm_sign_epi16(my, conj);
		__m128i ms2 = _mm_xor_si128(my, xmm5);
		ms2 = _mm_add_epi32(ms2, xmm4);
		ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

		__m128i mre = _mm_madd_epi16(my, mx);
		__m128i mim = _mm_madd_epi16(ms2, mx);

		_mm_storeu_si128((__m128i *) (re + wlen*i), mre);
		_mm_storeu_si128((__m128i *) (im + wlen*i), mim);
	}

	// Scalar tail for the remaining elements.
	for (int i = (len1 / wlen) * wlen; i < len1; i++){
		re[i] = x[i].re * y[i].re + x[i].im * y[i].im;
		im[i] = x[i].im * y[i].re - x[i].re * y[i].im;
	}

	return 0;
}
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i=0;i<dataSize16;i+=16)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32( acc1, acc2 );

    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
SIMD_INLINE __m128i SquaredDifference(__m128i a, __m128i b)
{
    const __m128i aLo = _mm_unpacklo_epi8(a, _mm_setzero_si128());
    const __m128i bLo = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    const __m128i dLo = _mm_sub_epi16(aLo, bLo);
    const __m128i aHi = _mm_unpackhi_epi8(a, _mm_setzero_si128());
    const __m128i bHi = _mm_unpackhi_epi8(b, _mm_setzero_si128());
    const __m128i dHi = _mm_sub_epi16(aHi, bHi);
    return _mm_add_epi32(_mm_madd_epi16(dLo, dLo), _mm_madd_epi16(dHi, dHi));
}
/* -----------------------------------
 *          weighted_merge_planar
 * -----------------------------------
 */
void weighted_merge_planar_sse2(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i zero = _mm_setzero_si128();
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m128i p0123 = _mm_unpacklo_epi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m128i p4567 = _mm_unpackhi_epi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m128i p01 = _mm_unpacklo_epi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m128i p23 = _mm_unpackhi_epi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m128i p45 = _mm_unpacklo_epi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m128i p67 = _mm_unpackhi_epi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_epi16(p01, mask);
      p23 = _mm_madd_epi16(p23, mask);
      p45 = _mm_madd_epi16(p45, mask);
      p67 = _mm_madd_epi16(p67, mask);

      p01 = _mm_add_epi32(p01, round_mask);
      p23 = _mm_add_epi32(p23, round_mask);
      p45 = _mm_add_epi32(p45, round_mask);
      p67 = _mm_add_epi32(p67, round_mask);

      p01 = _mm_srli_epi32(p01, 15);
      p23 = _mm_srli_epi32(p23, 15);
      p45 = _mm_srli_epi32(p45, 15);
      p67 = _mm_srli_epi32(p67, 15);

      p0123 = _mm_packs_epi32(p01, p23);
      p4567 = _mm_packs_epi32(p45, p67);

      __m128i result = _mm_packus_epi16(p0123, p4567);

      _mm_store_si128(reinterpret_cast<__m128i*>(p1+x), result);
    }

    for (int x = wMod16; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
}
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width,
                                            const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
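A small stand-alone check (not part of the library above) of the equivalence the comment relies on: when every 32-bit lane holds a non-negative value that fits in 15 bits, _mm_madd_epi16 and _mm_mullo_epi32 produce identical products, because the high 16-bit halves of each lane contribute nothing to the pmaddwd sum.

#include <smmintrin.h>  // SSE4.1, for _mm_mullo_epi32
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Values fit in 15 bits and sit one per 32-bit lane (high halves are zero).
  const __m128i a = _mm_setr_epi32(0, 1234, 20000, 32767);
  const __m128i b = _mm_setr_epi32(32767, 3, 11111, 1);

  const __m128i via_madd = _mm_madd_epi16(a, b);   // lo*lo + hi*hi, hi halves are 0
  const __m128i via_mull = _mm_mullo_epi32(a, b);  // plain 32-bit multiply

  int32_t m[4], p[4];
  _mm_storeu_si128((__m128i*)m, via_madd);
  _mm_storeu_si128((__m128i*)p, via_mull);
  for (int i = 0; i < 4; ++i)
    printf("%d %d %s\n", m[i], p[i], m[i] == p[i] ? "ok" : "MISMATCH");
  return 0;
}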
/* -----------------------------------
 *       weighted_merge_luma_yuy2
 * -----------------------------------
 */
static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x));  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma+x)); //v1 y3 u1 y2 v0 y1 u0 y0

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2);

      src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si128(src_hi, luma_mask);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_luma = _mm_packs_epi32(src_lo, src_hi);

      __m128i result_chroma = _mm_and_si128(px1, chroma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
}
SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
{
    const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
    const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
    const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(b0r0, K16_BLUE_RED));
    return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
#if defined(__SSE2__)
 {
  for(unsigned col = 0; col < 8; col++)
  {
   __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);

   for(unsigned x = 0; x < 8; x++)
   {
    __m128i sum;
    __m128i m;
    int32 tmp[4] MDFN_ALIGN(16);

    m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);

    sum = _mm_madd_epi16(m, c);
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));

    //_mm_store_ss((float *)&tmp[0], (__m128)sum);
    _mm_store_si128((__m128i*)tmp, sum);

    if(sizeof(T) == 1)
     out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
    else
     out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
   }
  }
 }
#endif
 // (non-SSE2 fallback omitted from this excerpt)
}
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
                                                 const __m128i b) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i C0 = _mm_cvtepu8_epi16(abs_a_b);
  const __m128i C1 = _mm_cvtepu8_epi16(_mm_srli_si128(abs_a_b, 8));
  // multiply with self
  const __m128i D0 = _mm_madd_epi16(C0, C0);
  const __m128i D1 = _mm_madd_epi16(C1, C1);
  // accumulate
  const __m128i sum = _mm_add_epi32(D0, D1);
  return sum;
}
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
                                       const int pre_stride,
                                       const int32_t *wsrc,
                                       const int32_t *mask,
                                       const int height) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

  // Combine pair of lines and convert to 16b.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
  // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
  //                  need absolute values, there is no need to do calculation
  //                  in 8bit as we are already in 16bit, ... Yet this is what
  //                  benchmarks the fastest!
  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
  const __m128i d3 = _mm_subs_epu8(b23s, a23s);

  // Square and add them all together.
  const __m128i madd0 = _mm_madd_epi16(d0, d0);
  const __m128i madd1 = _mm_madd_epi16(d1, d1);
  const __m128i madd2 = _mm_madd_epi16(d2, d2);
  const __m128i madd3 = _mm_madd_epi16(d3, d3);
  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
  const __m128i sum2 = _mm_add_epi32(sum0, sum1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum2);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
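A tiny scalar sketch (illustrative only, not from libwebp) of why summing the squares of the two saturated differences works: for unsigned inputs, at least one of sat(a-b) and sat(b-a) is always zero, so sat(a-b)^2 + sat(b-a)^2 equals (a-b)^2 exactly.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Saturating unsigned subtraction, the scalar counterpart of _mm_subs_epu8.
static uint8_t sat_sub_u8(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }

int main(void) {
  // Exhaustively check all 8-bit pairs.
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      const int d0 = sat_sub_u8((uint8_t)a, (uint8_t)b);
      const int d1 = sat_sub_u8((uint8_t)b, (uint8_t)a);
      const int diff = a - b;
      // One of d0/d1 is zero, so d0*d0 + d1*d1 == (a-b)^2.
      if (d0 * d0 + d1 * d1 != diff * diff) {
        printf("mismatch at a=%d b=%d\n", a, b);
        return EXIT_FAILURE;
      }
    }
  }
  printf("identity holds for all 8-bit pairs\n");
  return 0;
}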