int add_real_vector64_scalar(short *x, long long int a, short *y, unsigned int N)
{
  unsigned int i;        // loop counter
  __m128i *x_128;
  __m128i *y_128;
  __m128i alpha_128;     // scalar broadcast into both 64-bit lanes

  x_128 = (__m128i *)&x[0];
  y_128 = (__m128i *)&y[0];
  alpha_128 = _mm_set1_epi64((__m64) a);

  // four 64-bit vector additions per loop iteration
  for (i = 0; i < (N >> 3); i++) {
    y_128[0] = _mm_add_epi64(alpha_128, x_128[0]);
    y_128[1] = _mm_add_epi64(alpha_128, x_128[1]);
    y_128[2] = _mm_add_epi64(alpha_128, x_128[2]);
    y_128[3] = _mm_add_epi64(alpha_128, x_128[3]);
    x_128 += 4;
    y_128 += 4;
  }
  return (0);
}
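For readers skimming these examples, here is a minimal standalone sketch (not taken from any of the projects above) of what _mm_add_epi64 itself does: two independent 64-bit lane additions with wrap-around on overflow.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set_epi64x(INT64_C(1) << 62, 5);   /* lane1, lane0 */
    __m128i b = _mm_set_epi64x(INT64_C(1) << 62, 7);
    __m128i c = _mm_add_epi64(a, b);                   /* per-lane 64-bit add */
    int64_t out[2];
    _mm_storeu_si128((__m128i *)&out[0], c);
    /* out[0] == 12; out[1] wrapped to INT64_MIN, since 2^62 + 2^62 overflows int64 */
    printf("%lld %lld\n", (long long)out[0], (long long)out[1]);
    return 0;
}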
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i C0 = _mm_add_epi64(B0, rounder);
  const __m128i C1 = _mm_add_epi64(B1, rounder);
  const __m128i C2 = _mm_add_epi64(B2, rounder);
  const __m128i C3 = _mm_add_epi64(B3, rounder);
  const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
  const __m128i D2 = _mm_and_si128(C2, mask);
  const __m128i D3 = _mm_and_si128(C3, mask);
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0 ; i < block_size ; i += 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }
  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));
  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i*)(&sse), sse_reg128);
  _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
  return sse;
}
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available) { LPBYTE lumPlane = output[0]; LPBYTE uPlane = output[1]; LPBYTE vPlane = output[2]; int chrPitch = width>>1; if(bSSE2Available) { __m128i lumMask = _mm_set1_epi32(0x0000FF00); __m128i uvMask = _mm_set1_epi16(0x00FF); for(int y=startY; y<endY; y+=2) { int yPos = y*pitch; int chrYPos = ((y>>1)*chrPitch); int lumYPos = y*width; for(int x=0; x<width; x+=4) { LPBYTE lpImagePos = input+yPos+(x*4); int chrPos = chrYPos + (x>>1); int lumPos0 = lumYPos + x; int lumPos1 = lumPos0+width; __m128i line1 = _mm_load_si128((__m128i*)lpImagePos); __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch)); //pack lum vals { __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1)); packVal = _mm_packus_epi16(packVal, packVal); *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0]; *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1]; } //do average, pack UV vals { __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask)); __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2); avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0)); avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0)); avgVal = _mm_packus_epi16(avgVal, avgVal); DWORD packedVals = avgVal.m128i_u32[0]; *(LPWORD)(uPlane+chrPos) = WORD(packedVals); *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16); } } } } else { #ifdef _WIN64 for(int y=startY; y<endY; y+=2)
/**
 * Processes two doubles at a time
 */
int _mandelbrot_2( double const * const c_re_arg,
                   double const * const c_im_arg,
                   int max_iter )
{
    __m128d z_re = _mm_load_pd(c_re_arg);
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;

    __m128i count = _mm_set1_epi64x(0);

    __m128d md;
    __m128d mt;
    __m128i mi = _mm_set1_epi16(0xffff);

    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1)
    {
        // y = z .* z;
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z;
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // if condition
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));

        // PRINT_M128I(mi);
        if ( !_mm_movemask_pd(md) ) { break; }

        // count iterations
        count = _mm_add_epi64( count, _mm_and_si128( mi, one) );
    }

    int val;
    count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
    val = _mm_cvtsi128_si64( count );

    return val;
}
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128(); const int16_t *const end = src + n; assert(n % 64 == 0); while (src < end) { const __m128i v_val_0_w = xx_load_128(src); const __m128i v_val_1_w = xx_load_128(src + 8); const __m128i v_val_2_w = xx_load_128(src + 16); const __m128i v_val_3_w = xx_load_128(src + 24); const __m128i v_val_4_w = xx_load_128(src + 32); const __m128i v_val_5_w = xx_load_128(src + 40); const __m128i v_val_6_w = xx_load_128(src + 48); const __m128i v_val_7_w = xx_load_128(src + 56); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); src += 64; } v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); #if ARCH_X86_64 return (uint64_t)_mm_cvtsi128_si64(v_acc0_q); #else { uint64_t tmp; _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); return tmp; } #endif }
/** ******************************************************************************* * * @brief * Compute 8x4 SAD * * @par Description * Compute 8x4 sum of absolute differences between source and reference block * * @param[in] pu1_src * Source buffer * * @param[in] pu1_ref * Reference buffer * * @param[in] src_strd * Source stride * * @param[in] ref_strd * Reference stride * * @param[in] wd * Assumed to be 8 * * @param[in] ht * Assumed to be 4 * @returns * SAD * * @remarks * ******************************************************************************* */ WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src, UWORD8 *pu1_ref, WORD32 src_strd, WORD32 ref_strd, WORD32 wd, WORD32 ht) { WORD32 sad; __m128 src_r0, src_r1; __m128 ref_r0, ref_r1; __m128i res_r0, res_r1; UNUSED(wd); UNUSED(ht); ASSERT(wd == 8); ASSERT(ht == 4); /* Load source */ src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); pu1_src += src_strd; src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); pu1_src += src_strd; src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); pu1_src += src_strd; src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); pu1_src += src_strd; /* Load reference */ ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref)); pu1_ref += ref_strd; ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref)); pu1_ref += ref_strd; ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref)); pu1_ref += ref_strd; ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref)); pu1_ref += ref_strd; /* Compute SAD for each row */ res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0); res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1); /* Accumulate SAD */ res_r0 = _mm_add_epi64(res_r0, res_r1); res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8)); sad = _mm_cvtsi128_si32(res_r0); return sad; }
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, const uint8_t* src) { const int x_sub = wrk->x_sub; int accum = 0; const __m128i zero = _mm_setzero_si128(); const __m128i mult0 = _mm_set1_epi16(x_sub); const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale); const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); __m128i sum = zero; rescaler_t* frow = wrk->frow; const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width; if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) { WebPRescalerImportRowShrink_C(wrk, src); return; } assert(!WebPRescalerInputDone(wrk)); assert(!wrk->x_expand); for (; frow < frow_end; frow += 4) { __m128i base = zero; accum += wrk->x_add; while (accum > 0) { const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit. sum = _mm_add_epi16(sum, base); accum -= x_sub; } { // Emit next horizontal pixel. const __m128i mult = _mm_set1_epi16(-accum); const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b const __m128i frac1 = _mm_mulhi_epu16(base, mult); const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b const __m128i A0 = _mm_mullo_epi16(sum, mult0); const __m128i A1 = _mm_mulhi_epu16(sum, mult0); const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac const __m128i D0 = _mm_srli_epi64(frac, 32); const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b const __m128i D2 = _mm_mul_epu32(D0, mult1); const __m128i E1 = _mm_add_epi64(D1, rounder); const __m128i E2 = _mm_add_epi64(D2, rounder); const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2)); const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2)); const __m128i G = _mm_unpacklo_epi32(F1, F2); sum = _mm_packs_epi32(G, zero); _mm_storeu_si128((__m128i*)frow, frow_out); } } assert(accum == 0); }
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; const int x_out_max = wrk->dst_width * wrk->num_channels; const rescaler_t* const frow = wrk->frow; const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale); assert(!WebPRescalerOutputDone(wrk)); assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0); assert(wrk->y_expand); if (wrk->y_accum == 0) { for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3); ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; const int v = (int)MULT_FIX(J, wrk->fy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; } } else { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); const __m128i mA = _mm_set_epi32(0, A, 0, A); const __m128i mB = _mm_set_epi32(0, B, 0, B); const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3); LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(A0, B0); const __m128i C1 = _mm_add_epi64(A1, B1); const __m128i C2 = _mm_add_epi64(A2, B2); const __m128i C3 = _mm_add_epi64(A3, B3); const __m128i D0 = _mm_add_epi64(C0, rounder); const __m128i D1 = _mm_add_epi64(C1, rounder); const __m128i D2 = _mm_add_epi64(C2, rounder); const __m128i D3 = _mm_add_epi64(C3, rounder); const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX); const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX); const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX); const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX); ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { const uint64_t I = (uint64_t)A * frow[x_out] + (uint64_t)B * irow[x_out]; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX(J, wrk->fy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; } } }
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16 *inVec1,   /* I input vector 1   */
    const opus_int16 *inVec2,   /* I input vector 2   */
    const opus_int   len        /* I vector lengths   */
)
{
    opus_int   i, dataSize8;
    opus_int64 sum;

    __m128i xmm_tempa;
    __m128i inVec1_76543210, acc1;
    __m128i inVec2_76543210, acc2;

    sum = 0;
    dataSize8 = len & ~7;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for( i = 0; i < dataSize8; i += 8 ) {
        inVec1_76543210 = _mm_loadu_si128( (__m128i*)(&inVec1[i + 0] ) );
        inVec2_76543210 = _mm_loadu_si128( (__m128i*)(&inVec2[i + 0] ) );

        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
        /* equal shift right 8 bytes */
        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

        acc1 = _mm_add_epi64( acc1, xmm_tempa );
        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
    }

    acc1 = _mm_add_epi64( acc1, acc2 );
    /* equal shift right 8 bytes */
    acc2 = _mm_shuffle_epi32( acc1, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    acc1 = _mm_add_epi64( acc1, acc2 );

    _mm_storel_epi64( (__m128i *)&sum, acc1 );

    for( ; i < len; i++ ) {
        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
    }

    return sum;
}
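A small illustrative check (standalone, not part of the Opus source) of the wrap-around note in the snippet above: _mm_madd_epi16 forms a0*b0 + a1*b1 in 32 bits, and only the pattern where both products are (-32768)*(-32768) = 2^30 can overflow, since 2^30 + 2^30 = 2^31 wraps to INT32_MIN.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    const __m128i v = _mm_set1_epi16(-32768);
    const __m128i d = _mm_madd_epi16(v, v);  /* every 32-bit lane holds 2^31, wrapped */
    printf("%d\n", _mm_cvtsi128_si32(d));    /* prints -2147483648 */
    return 0;
}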
void generate(std::array<M128I<U>, Rp1> &rk,
    const M128I<std::uint64_t> &weyl, std::true_type) const
{
    std::get<N>(rk) =
        _mm_add_epi64(std::get<N - 1>(rk).value(), weyl.value());
    generate<N + 1>(rk, weyl, std::integral_constant<bool, N + 1 < Rp1>());
}
void*drawman(void*x){ int c=col++; unsigned _m=mx,mxx=16777216/_m; double _x=xx,_y=yy,_w=wh; do{ __m128d cr=_mm_set1_pd(_x+_w*c); for(int j=0;j<512;j+=2){ __m128d zr=cr, zi=_mm_set_pd(_y+_w*j,_y+_w*(j+1)),ci=zi, zr2=_mm_mul_pd(zr,zr),zi2=_mm_mul_pd(zi,zi); unsigned mk=mx-1; uint64_t kk[2]__attribute__((aligned(16)))={mk,mk}; __m128i k=_mm_load_si128((__m128i*)kk); do{ zi=_mm_mul_pd(zi,zr); zi=_mm_add_pd(_mm_add_pd(zi,zi),ci); zr=_mm_add_pd(_mm_sub_pd(zr2,zi2),cr); zr2=_mm_mul_pd(zr,zr); zi2=_mm_mul_pd(zi,zi); __m128d n=_mm_cmplt_pd(_mm_add_pd(zr2,zi2),_mm_set1_pd(4)); if(!_mm_movemask_pd(n))break; k=_mm_add_epi64(k,_mm_castpd_si128(n)); }while(--mk); _mm_store_si128((__m128i*)kk,k); manor[c][j]=kk[1]*mxx>>16; manor[c][j+1]=kk[0]*mxx>>16; } done[c>>6]|=1ULL<<(c&63); c=col++; }while(c<512&&!pull); }
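Both Mandelbrot kernels above count iterations with the same masked-add trick: a compare yields 0 or all-ones (-1) per 64-bit lane, so the mask can be ANDed with 1 and added (incrementing only the still-active lanes, as in _mandelbrot_2) or added directly (decrementing, as in drawman). A minimal sketch of the incrementing form, with hypothetical names:

#include <emmintrin.h>

/* add 1 to each 64-bit lane whose comparison mask is set */
static inline __m128i count_active(__m128i count, __m128d still_iterating) {
    const __m128i one  = _mm_set1_epi64x(1);
    const __m128i mask = _mm_castpd_si128(still_iterating); /* 0 or ~0 per lane */
    return _mm_add_epi64(count, _mm_and_si128(mask, one));
}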
template <bool align> void SquaredDifferenceSum( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if(align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); } size_t bodyWidth = AlignLo(width, A); __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth); __m128i fullSum = _mm_setzero_si128(); for(size_t row = 0; row < height; ++row) { __m128i rowSum = _mm_setzero_si128(); for(size_t col = 0; col < bodyWidth; col += A) { const __m128i a_ = Load<align>((__m128i*)(a + col)); const __m128i b_ = Load<align>((__m128i*)(b + col)); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } if(width - bodyWidth) { const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A))); const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum)); a += aStride; b += bStride; } *sum = ExtractInt64Sum(fullSum); }
__m128i test_mm_add_epi64(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_add_epi64
  // DAG: add <2 x i64>
  //
  // ASM-LABEL: test_mm_add_epi64
  // ASM: paddq
  return _mm_add_epi64(A, B);
}
void Convert444toNV12(LPBYTE input, int width, int inPitch, int outPitch, int height, int startY, int endY, LPBYTE *output) { LPBYTE lumPlane = output[0]; LPBYTE uvPlane = output[1]; __m128i lumMask = _mm_set1_epi32(0x0000FF00); __m128i uvMask = _mm_set1_epi16(0x00FF); for(int y=startY; y<endY; y+=2) { int yPos = y*inPitch; int uvYPos = (y>>1)*outPitch; int lumYPos = y*outPitch; for(int x=0; x<width; x+=4) { LPBYTE lpImagePos = input+yPos+(x*4); int uvPos = uvYPos + x; int lumPos0 = lumYPos + x; int lumPos1 = lumPos0 + outPitch; __m128i line1 = _mm_load_si128((__m128i*)lpImagePos); __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+inPitch)); //pack lum vals { __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1)); packVal = _mm_packus_epi16(packVal, packVal); *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0]; *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1]; } //do average, pack UV vals { __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask)); __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2); avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0)); *(LPUINT)(uvPlane+uvPos) = _mm_packus_epi16(avgVal, avgVal).m128i_u32[0]; } } } }
/*
 * mixed endian increment, low 64bits stored in hi word to be compatible
 * with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}
unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) {
    __m128i sum = zeroes;

    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x += 16) {
            __m128i src;

            if (width == 4)
                src = _mm_cvtsi32_si128(*(const int *)pSrc);
            else if (width == 8)
                src = _mm_loadl_epi64((const __m128i *)pSrc);
            else
                src = _mm_loadu_si128((const __m128i *)&pSrc[x]);

            sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes));
        }

        pSrc += nSrcPitch;
    }

    if (width >= 16)
        sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8));

    return (unsigned)_mm_cvtsi128_si32(sum);
}
inline void MoveNext(int s) {
    if(s == 0)
        return;
    else if(s < 4){
        auto v = uv, st = uvStep;
        while(s--) {
            v = _mm_add_epi64(v, st);
        }
        uv = v;
    } else {
        // no SSE2 support for 64bit multiply, but
        // this isn't a big problem because this case is rare
        uvU += stepU * s;
        uvV += stepV * s;
    }
}
SSE_FUNCTION static void
sad8x8_u8_sse (uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2,
    int sstr2)
{
  int i;
  __m128i sum = _mm_setzero_si128();
  union m128_int sumi;

  for (i = 0; i < 4; i++) {
    __m128i xmm0, xmm1, xmm2, xmm3;
    xmm0 = _mm_loadl_epi64((__m128i *)src1);
    xmm1 = _mm_loadl_epi64((__m128i *)(src1 + sstr1));
    xmm2 = _mm_loadl_epi64((__m128i *)src2);
    xmm3 = _mm_loadl_epi64((__m128i *)(src2 + sstr2));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm3);
    sum = _mm_add_epi64(sum, _mm_sad_epu8(xmm0, xmm2));
    src1 += 2 * sstr1;
    src2 += 2 * sstr2;
  }
  sumi.m128 = sum;
  *dest = sumi.i[0] + sumi.i[2];
}
int64_t get_sum_vectorised (int64_t * vector)
{
    __m128i sum = _mm_setzero_si128();
    int64_t actualSum = 0;

    /* Each __m128i holds two 64-bit elements, so advance by 2 per iteration. */
    for (int64_t i = 0; i < g_length/2*2; i += 2)
    {
        __m128i temp = _mm_loadu_si128((__m128i *)(vector + i));
        sum = _mm_add_epi64(sum, temp);
    }

    int64_t A[2] = {0, 0};
    _mm_storeu_si128((__m128i *)A, sum);
    actualSum += A[0] + A[1];

    /* Handle the last element when g_length is odd. */
    for (int64_t i = g_length/2*2; i < g_length; i++)
    {
        actualSum += vector[i];
    }

    return actualSum;
}
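Several of the accumulator loops above end the same way. A reusable sketch (hypothetical helper, mirroring what e.g. aom_sum_squares_i16_64n_sse2 and luma_sse2 do inline) of the horizontal 64-bit reduction: fold the high lane onto the low lane with _mm_srli_si128 + _mm_add_epi64, then extract the scalar.

#include <emmintrin.h>
#include <stdint.h>

static inline int64_t hsum_epi64(__m128i v) {
    v = _mm_add_epi64(v, _mm_srli_si128(v, 8));  /* low lane = lane0 + lane1 */
#if defined(__x86_64__) || defined(_M_X64)
    return _mm_cvtsi128_si64(v);                 /* direct register move on 64-bit targets */
#else
    {
        int64_t tmp;
        _mm_storel_epi64((__m128i *)&tmp, v);    /* 32-bit-safe fallback */
        return tmp;
    }
#endif
}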
static inline void ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq) { int i; uint16_t rx_id; volatile union ixgbe_adv_rx_desc *rxdp; struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start]; struct rte_mbuf *mb0, *mb1; __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM); __m128i dma_addr0, dma_addr1; const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX); rxdp = rxq->rx_ring + rxq->rxrearm_start; /* Pull 'n' more MBUFs into the software ring */ if (rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep, RTE_IXGBE_RXQ_REARM_THRESH) < 0) { if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >= rxq->nb_rx_desc) { dma_addr0 = _mm_setzero_si128(); for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) { rxep[i].mbuf = &rxq->fake_mbuf; _mm_store_si128((__m128i *)&rxdp[i].read, dma_addr0); } } rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += RTE_IXGBE_RXQ_REARM_THRESH; return; } /* Initialize the mbufs in vector, process 2 mbufs in one loop */ for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) { __m128i vaddr0, vaddr1; uintptr_t p0, p1; mb0 = rxep[0].mbuf; mb1 = rxep[1].mbuf; /* * Flush mbuf with pkt template. * Data to be rearmed is 6 bytes long. * Though, RX will overwrite ol_flags that are coming next * anyway. So overwrite whole 8 bytes with one load: * 6 bytes of rearm_data plus first 2 bytes of ol_flags. */ p0 = (uintptr_t)&mb0->rearm_data; *(uint64_t *)p0 = rxq->mbuf_initializer; p1 = (uintptr_t)&mb1->rearm_data; *(uint64_t *)p1 = rxq->mbuf_initializer; /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */ vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr)); vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr)); /* convert pa to dma_addr hdr/data */ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); /* add headroom to pa values */ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); /* set Header Buffer Address to zero */ dma_addr0 = _mm_and_si128(dma_addr0, hba_msk); dma_addr1 = _mm_and_si128(dma_addr1, hba_msk); /* flush desc with pa dma_addr */ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); } rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH; if (rxq->rxrearm_start >= rxq->nb_rx_desc) rxq->rxrearm_start = 0; rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH; rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); /* Update the tail pointer on the NIC */ IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id); }
HashReturn final_echo(hashState_echo *state, BitSequence *hashval) { __m128i remainingbits; // Add remaining bytes in the buffer state->processed_bits += state->uBufferBytes * 8; remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8); // Pad with 0x80 state->buffer[state->uBufferBytes++] = 0x80; // Enough buffer space for padding in this block? if((state->uBlockLength - state->uBufferBytes) >= 18) { // Pad with zeros memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18)); // Hash size *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; // Processed bits *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; // Last block contains message bits? if(state->uBufferBytes == 1) { state->k = _mm_xor_si128(state->k, state->k); state->k = _mm_sub_epi64(state->k, state->const1536); } else { state->k = _mm_add_epi64(state->k, remainingbits); state->k = _mm_sub_epi64(state->k, state->const1536); } // Compress Compress(state, state->buffer, 1); } else { // Fill with zero and compress memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes); state->k = _mm_add_epi64(state->k, remainingbits); state->k = _mm_sub_epi64(state->k, state->const1536); Compress(state, state->buffer, 1); // Last block memset(state->buffer, 0, state->uBlockLength - 18); // Hash size *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; // Processed bits *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; // Compress the last block state->k = _mm_xor_si128(state->k, state->k); state->k = _mm_sub_epi64(state->k, state->const1536); Compress(state, state->buffer, 1); } // Store the hash value _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); if(state->uHashSize == 512) { _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); } return SUCCESS; }
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount) { unsigned int r, b, i, j; __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp; __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; for(i = 0; i < 4; i++) for(j = 0; j < ctx->uHashSize / 256; j++) _state[i][j] = ctx->state[i][j]; #if HAVE_AES_NI // transform cv for(i = 0; i < 4; i++) for(j = 0; j < ctx->uHashSize / 256; j++) { TRANSFORM(_state[i][j], _k_ipt, t1, t2); } #endif for(b = 0; b < uBlockCount; b++) { ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); // load message for(j = ctx->uHashSize / 256; j < 4; j++) { for(i = 0; i < 4; i++) { _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); #if HAVE_AES_NI // transform message TRANSFORM(_state[i][j], _k_ipt, t1, t2); #endif } } // save state SAVESTATE(_statebackup, _state); k1 = ctx->k; #if HAVE_AES_NI for(r = 0; r < ctx->uRounds / 2; r++) { ECHO_ROUND_UNROLL2; } #else for(r = 0; r < ctx->uRounds / 2; r++) { _state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero); _state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero); _state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero); _state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero); ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2); _state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero); _state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero); _state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero); _state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero); ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0); 
ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2); } #endif if(ctx->uHashSize == 256) { for(i = 0; i < 4; i++) { _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); } } else { for(i = 0; i < 4; i++) { _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); } } pmsg += ctx->uBlockLength; } #if HAVE_AES_NI // transform state for(i = 0; i < 4; i++) for(j = 0; j < 4; j++) { TRANSFORM(_state[i][j], _k_opt, t1, t2); } #endif SAVESTATE(ctx->state, _state); }
uint32_t FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]) { FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4; uint32_t i, order; __m128i total_err0, total_err1, total_err3; { FLAC__int32 itmp; __m128i last_error, zero = _mm_setzero_si128(); last_error = _mm_cvtsi32_si128(data[-1]); // 0 0 0 le0 itmp = data[-2]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 0 le0 le1 itmp -= data[-3]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 le0 le1 le2 itmp -= data[-3] - data[-4]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // le0 le1 le2 le3 total_err0 = total_err1 = total_err3 = _mm_setzero_si128(); for(i = 0; i < data_len; i++) { __m128i err0, err1, tmp; err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0 err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0 #if 1 /* OPT_SSE */ err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 le0 le1 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #else last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #endif tmp = _mm_slli_si128(err0, 12); // e0 0 0 0 last_error = _mm_srli_si128(err1, 4); // 0 e1 e2 e3 last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3 tmp = _mm_srai_epi32(err0, 31); err0 = _mm_xor_si128(err0, tmp); err0 = _mm_sub_epi32(err0, tmp); tmp = _mm_srai_epi32(err1, 31); err1 = _mm_xor_si128(err1, tmp); err1 = _mm_sub_epi32(err1, tmp); total_err0 = _mm_add_epi64(total_err0, err0); // 0 te0 err0 = _mm_unpacklo_epi32(err1, zero); // 0 |e3| 0 |e4| err1 = _mm_unpackhi_epi32(err1, zero); // 0 |e1| 0 |e2| total_err3 = _mm_add_epi64(total_err3, err0); // te3 te4 total_err1 = _mm_add_epi64(total_err1, err1); // te1 te2 } } m128i_to_i64(total_error_0, total_err0); m128i_to_i64(total_error_4, total_err3); m128i_to_i64(total_error_2, total_err1); total_err3 = _mm_srli_si128(total_err3, 8); // 0 te3 total_err1 = _mm_srli_si128(total_err1, 8); // 0 te1 m128i_to_i64(total_error_3, total_err3); m128i_to_i64(total_error_1, total_err1); /* prefer higher order */ if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4)) order = 0; else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4)) order = 1; else if(total_error_2 < flac_min(total_error_3, total_error_4)) order = 2; else if(total_error_3 < total_error_4) order = 3; else order = 4; /* Estimate the expected number of bits per residual signal sample. 
*/ /* 'total_error*' is linearly related to the variance of the residual */ /* signal, so we use it directly to compute E(|x|) */ FLAC__ASSERT(data_len > 0 || total_error_0 == 0); FLAC__ASSERT(data_len > 0 || total_error_1 == 0); FLAC__ASSERT(data_len > 0 || total_error_2 == 0); FLAC__ASSERT(data_len > 0 || total_error_3 == 0); FLAC__ASSERT(data_len > 0 || total_error_4 == 0); residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); return order; }
/***************************************************************************** * This function utilises 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * * vp9_encoder.c. * * For the joint cost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * * For the component costs: * * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * * If these do not hold, then this function cannot be used without * * modification, in which case you can revert to using the C implementation, * * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; #if ARCH_X86_64 __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif unsigned int best_sad; int i; int j; int step; // Check the prerequisite cost function properties that are easy to check // in an assert. See the function-level documentation for details on all // prerequisites. 
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); // Check the starting position best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { __m128i v_sad_d; __m128i v_cost_d; __m128i v_outside_d; __m128i v_inside_d; __m128i v_diff_mv_w; #if ARCH_X86_64 __m128i v_blocka[2]; #else __m128i v_blocka[1]; #endif // Compute the candidate motion vectors const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); // Clamp them to the search bounds __m128i v_these_mv_clamp_w = v_these_mv_w; v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); // The ones that did not change are inside the search area v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); // If none of them are inside, then move on if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) { continue; } // The inverse mask indicates which of the MVs are outside v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); // Compute the difference MV v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); // We utilise the fact that the cost function is even, and use the // absolute difference. This allows us to use unsigned indexes later // and reduces cache pressure somewhat as only a half of the table // is ever referenced. v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); // Compute the SIMD pointer offsets. 
{ #if ARCH_X86_64 // sizeof(intptr_t) == 8 // Load the offsets __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); // Set the ones falling outside to zero v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); v_bo32_q = _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); // Compute the candidate addresses v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); #else // ARCH_X86 // sizeof(intptr_t) == 4 __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); #endif } fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], in_what_stride, (uint32_t*)&v_sad_d); // Look up the component cost of the residual motion vector { const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); // Note: This is a use case for vpgather in AVX2 const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; __m128i v_cost_10_d, v_cost_32_d; v_cost_10_d = _mm_cvtsi32_si128(cost0); v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); v_cost_32_d = _mm_cvtsi32_si128(cost2); v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); } // Now add in the joint cost { const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); } // Multiply by sad_per_bit v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); // ROUND_POWER_OF_TWO(v_cost_d, 8) v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); v_cost_d = _mm_srai_epi32(v_cost_d, 8); // Add the cost to the sad v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); // Make the motion vectors outside the search area have max cost // by or'ing in the comparison mask, this way the minimum search won't // pick them. v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); // Find the minimum value and index horizontally in v_sad_d { // Try speculatively on 16 bits, so we can use the minpos intrinsic const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); // If the local best value is not saturated, just use it, otherwise // find the horizontal minimum again the hard way on 32 bits. // This is executed rarely. 
if (__unlikely__(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; v_loval_d = v_sad_d; v_loidx_d = _mm_set_epi32(3, 2, 1, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); local_best_sad = _mm_extract_epi32(v_loval_d, 0); local_best_idx = _mm_extract_epi32(v_loidx_d, 0); } // Update the global minimum if the local minimum is smaller if (__likely__(local_best_sad < best_sad)) { new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; } } } bmv = new_bmv; best_address = new_best_address; v_bmv_w = _mm_set1_epi32(bmv.as_int); #if ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif if (__unlikely__(best_address == in_what)) { (*num00)++; } } *best_mv = bmv.as_mv; return best_sad; }
/** ******************************************************************************* * * @brief * Performs spatial edge adaptive filtering * * @par Description * Performs spatial edge adaptive filtering by detecting edge direction * * @param[in] pu1_src * Source buffer * * @param[in] pu1_out * Destination buffer * * @param[in] src_strd * Source stride * * @param[in] out_strd * Destination stride * @returns * None * * @remarks * ******************************************************************************* */ void ideint_spatial_filter_ssse3(UWORD8 *pu1_src, UWORD8 *pu1_out, WORD32 src_strd, WORD32 out_strd) { WORD32 i; WORD32 adiff[6]; WORD32 *pi4_diff; WORD32 shifts[2]; WORD32 dir_45_le_90, dir_45_le_135, dir_135_le_90; __m128i row1_0, row1_m1, row1_p1; __m128i row2_0, row2_m1, row2_p1; __m128i diff, diffs[3]; __m128i zero; /*****************************************************************/ /* Direction detection */ /*****************************************************************/ zero = _mm_setzero_si128(); diffs[0] = _mm_setzero_si128(); diffs[1] = _mm_setzero_si128(); diffs[2] = _mm_setzero_si128(); /* Load source */ row1_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1)); row1_0 = _mm_loadl_epi64((__m128i *) (pu1_src)); row1_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1)); pu1_src += src_strd; /* Unpack to 16 bits */ row1_m1 = _mm_unpacklo_epi8(row1_m1, zero); row1_0 = _mm_unpacklo_epi8(row1_0, zero); row1_p1 = _mm_unpacklo_epi8(row1_p1, zero); /*****************************************************************/ /* Calculating the difference along each of the 3 directions. */ /*****************************************************************/ for(i = 0; i < SUB_BLK_HT; i ++) { row2_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1)); row2_0 = _mm_loadl_epi64((__m128i *) (pu1_src)); row2_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1)); pu1_src += src_strd; /* Unpack to 16 bits */ row2_m1 = _mm_unpacklo_epi8(row2_m1, zero); row2_0 = _mm_unpacklo_epi8(row2_0, zero); row2_p1 = _mm_unpacklo_epi8(row2_p1, zero); diff = _mm_sad_epu8(row1_0, row2_0); diffs[0] = _mm_add_epi64(diffs[0], diff); diff = _mm_sad_epu8(row1_m1, row2_p1); diffs[1] = _mm_add_epi64(diffs[1], diff); diff = _mm_sad_epu8(row1_p1, row2_m1); diffs[2] = _mm_add_epi64(diffs[2], diff); row1_m1 = row2_m1; row1_0 = row2_0; row1_p1 = row2_p1; } /* Revert pu1_src increment */ pu1_src -= (SUB_BLK_HT + 1) * src_strd; adiff[0] = _mm_cvtsi128_si32(diffs[0]); adiff[1] = _mm_cvtsi128_si32(diffs[1]); adiff[2] = _mm_cvtsi128_si32(diffs[2]); adiff[3] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[0], 8)); adiff[4] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[1], 8)); adiff[5] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[2], 8)); pi4_diff = adiff; for(i = 0; i < 2; i++) { /*****************************************************************/ /* Applying bias, to make the diff comparision more robust. */ /*****************************************************************/ pi4_diff[0] *= EDGE_BIAS_0; pi4_diff[1] *= EDGE_BIAS_1; pi4_diff[2] *= EDGE_BIAS_1; /*****************************************************************/ /* comapring the diffs */ /*****************************************************************/ dir_45_le_90 = (pi4_diff[2] <= pi4_diff[0]); dir_45_le_135 = (pi4_diff[2] <= pi4_diff[1]); dir_135_le_90 = (pi4_diff[1] <= pi4_diff[0]); /*****************************************************************/ /* Direction selection. 
*/ /*****************************************************************/ shifts[i] = 0; if(1 == dir_45_le_135) { if(1 == dir_45_le_90) shifts[i] = 1; } else { if(1 == dir_135_le_90) shifts[i] = -1; } pi4_diff += 3; } /*****************************************************************/ /* Directional interpolation */ /*****************************************************************/ for(i = 0; i < SUB_BLK_HT / 2; i++) { __m128i dst; __m128i row1, row2; UWORD32 *pu4_row1th, *pu4_row1tl; UWORD32 *pu4_row2th, *pu4_row2tl; UWORD32 *pu4_row1bh, *pu4_row1bl; UWORD32 *pu4_row2bh, *pu4_row2bl; pu4_row1th = (UWORD32 *)(pu1_src + shifts[0]); pu4_row1tl = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]); pu1_src += src_strd; pu4_row2th = (UWORD32 *)(pu1_src + shifts[0]); pu4_row2tl = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]); pu4_row1bh = (UWORD32 *)(pu1_src - shifts[0]); pu4_row1bl = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]); pu1_src += src_strd; pu4_row2bh = (UWORD32 *)(pu1_src - shifts[0]); pu4_row2bl = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]); row1 = _mm_set_epi32(*pu4_row1tl, *pu4_row1th, *pu4_row2tl, *pu4_row2th); row2 = _mm_set_epi32(*pu4_row1bl, *pu4_row1bh, *pu4_row2bl, *pu4_row2bh); dst = _mm_avg_epu8(row1, row2); _mm_storel_epi64((__m128i *)pu1_out, _mm_srli_si128(dst, 8)); pu1_out += out_strd; _mm_storel_epi64((__m128i *)pu1_out, dst); pu1_out += out_strd; } }
mlib_status __mlib_VectorSumAbsDiff_S32_Sat( mlib_d64 *z, const mlib_s32 *x, const mlib_s32 *y, mlib_s32 n) { if (n <= 0) return (MLIB_FAILURE); mlib_s32 i, nstep, ax, ay, n1, n2, n3; mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y; __m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext; mlib_d64 dsum = 0.0; zero = _mm_setzero_si128(); zbuf = zero; nstep = 16 / sizeof (mlib_s32); ax = (mlib_addr)x & 15; ay = (mlib_addr)y & 15; n1 = ((16 - ax) & 15) / sizeof (mlib_s32); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { dsum += mlib_fabs((mlib_d64)(*px++) - (*py++)); } *z = dsum; } else { for (i = 0; i < n1; i++) { dsum += mlib_fabs((mlib_d64)(*px++) - (*py++)); } if (ax == ay) { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi32(ybuf, xbuf); xbuf = _mm_sub_epi32(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi32(xbuf, mext); xlo = _mm_unpacklo_epi32(xbuf, zero); xhi = _mm_unpackhi_epi32(xbuf, zero); zbuf = _mm_add_epi64(zbuf, xlo); zbuf = _mm_add_epi64(zbuf, xhi); px += nstep; py += nstep; } } else { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi32(ybuf, xbuf); xbuf = _mm_sub_epi32(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi32(xbuf, mext); xlo = _mm_unpacklo_epi32(xbuf, zero); xhi = _mm_unpackhi_epi32(xbuf, zero); zbuf = _mm_add_epi64(zbuf, xlo); zbuf = _mm_add_epi64(zbuf, xhi); px += nstep; py += nstep; } } for (i = 0; i < n3; i++) { dsum += mlib_fabs((mlib_d64)(*px++) - (*py++)); } long long pz[2]; _mm_storeu_si128((__m128i *)pz, zbuf); dsum += pz[0]; dsum += pz[1]; *z = dsum; } return (MLIB_SUCCESS); }
mlib_status __mlib_VectorSumAbsDiff_S16_Sat( mlib_d64 *z, const mlib_s16 *x, const mlib_s16 *y, mlib_s32 n) { if (n <= 0) return (MLIB_FAILURE); mlib_s32 i, nstep, ax, ay, n1, n2, n3, xval, sum = 0; mlib_s16 *px = (mlib_s16 *)x, *py = (mlib_s16 *)y; __m128i zero, xbuf, ybuf, zbuf32, zbuf64, xlo, xhi, mext; zero = _mm_setzero_si128(); zbuf64 = zero; nstep = 16 / sizeof (mlib_s16); ax = (mlib_addr)x & 15; ay = (mlib_addr)y & 15; n1 = ((16 - ax) & 15) / sizeof (mlib_s16); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { xval = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(xval); } *z = sum; } else { for (i = 0; i < n1; i++) { xval = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(xval); } mlib_s32 nblock = n2 >> 12; mlib_s32 tail = n2 & 4095; mlib_s32 k; if (ax == ay) { for (k = 0; k < nblock; k++) { zbuf32 = zero; for (i = 0; i < 4096; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi16(ybuf, xbuf); xbuf = _mm_sub_epi16(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi16(xbuf, mext); xlo = _mm_unpacklo_epi16(xbuf, zero); xhi = _mm_unpackhi_epi16(xbuf, zero); zbuf32 = _mm_add_epi32(zbuf32, xlo); zbuf32 = _mm_add_epi32(zbuf32, xhi); px += nstep; py += nstep; } xlo = _mm_unpacklo_epi32(zbuf32, zero); xhi = _mm_unpackhi_epi32(zbuf32, zero); zbuf64 = _mm_add_epi64(zbuf64, xlo); zbuf64 = _mm_add_epi64(zbuf64, xhi); } zbuf32 = zero; for (i = 0; i < tail; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi16(ybuf, xbuf); xbuf = _mm_sub_epi16(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi16(xbuf, mext); xlo = _mm_unpacklo_epi16(xbuf, zero); xhi = _mm_unpackhi_epi16(xbuf, zero); zbuf32 = _mm_add_epi32(zbuf32, xlo); zbuf32 = _mm_add_epi32(zbuf32, xhi); px += nstep; py += nstep; } xlo = _mm_unpacklo_epi32(zbuf32, zero); xhi = _mm_unpackhi_epi32(zbuf32, zero); zbuf64 = _mm_add_epi64(zbuf64, xlo); zbuf64 = _mm_add_epi64(zbuf64, xhi); } else { /* not aligned */ for (k = 0; k < nblock; k++) { zbuf32 = zero; for (i = 0; i < 4096; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi16(ybuf, xbuf); xbuf = _mm_sub_epi16(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi16(xbuf, mext); xlo = _mm_unpacklo_epi16(xbuf, zero); xhi = _mm_unpackhi_epi16(xbuf, zero); zbuf32 = _mm_add_epi32(zbuf32, xlo); zbuf32 = _mm_add_epi32(zbuf32, xhi); px += nstep; py += nstep; } xlo = _mm_unpacklo_epi32(zbuf32, zero); xhi = _mm_unpackhi_epi32(zbuf32, zero); zbuf64 = _mm_add_epi64(zbuf64, xlo); zbuf64 = _mm_add_epi64(zbuf64, xhi); } zbuf32 = zero; for (i = 0; i < tail; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi16(ybuf, xbuf); xbuf = _mm_sub_epi16(xbuf, ybuf); xbuf = _mm_xor_si128(xbuf, mext); xbuf = _mm_sub_epi16(xbuf, mext); xlo = _mm_unpacklo_epi16(xbuf, zero); xhi = _mm_unpackhi_epi16(xbuf, zero); zbuf32 = _mm_add_epi32(zbuf32, xlo); zbuf32 = _mm_add_epi32(zbuf32, xhi); px += nstep; py += nstep; } xlo = _mm_unpacklo_epi32(zbuf32, zero); xhi = _mm_unpackhi_epi32(zbuf32, zero); zbuf64 = _mm_add_epi64(zbuf64, xlo); zbuf64 = _mm_add_epi64(zbuf64, xhi); } for (i = 0; i < n3; i++) { xval = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(xval); } mlib_d64 dsum = sum; long long pz[2]; _mm_storeu_si128((__m128i *)pz, zbuf64); dsum += pz[0]; dsum += pz[1]; *z = dsum; } return (MLIB_SUCCESS); }
mlib_status __mlib_VectorSumAbsDiff_S8_Sat( mlib_d64 *z, const mlib_s8 *x, const mlib_s8 *y, mlib_s32 n) { if (n <= 0) return (MLIB_FAILURE); mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0; mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y; __m128i zero, xbuf, ybuf, zbuf, mext, mbuf; zero = _mm_setzero_si128(); zbuf = zero; nstep = 16 / sizeof (mlib_s8); ax = (mlib_addr)x & 15; ay = (mlib_addr)y & 15; n1 = ((16 - ax) & 15) / sizeof (mlib_s8); n2 = (n - n1) / nstep; n3 = n - n1 - n2 * nstep; if (n2 < 1) { for (i = 0; i < n; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } *z = sum; } else { for (i = 0; i < n1; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } if (ax == ay) { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_load_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } else { for (i = 0; i < n2; i++) { xbuf = _mm_load_si128((__m128i *)px); ybuf = _mm_loadu_si128((__m128i *)py); mext = _mm_cmpgt_epi8(ybuf, xbuf); mbuf = _mm_sub_epi8(xbuf, ybuf); mbuf = _mm_xor_si128(mbuf, mext); mbuf = _mm_sub_epi8(mbuf, mext); mbuf = _mm_sad_epu8(mbuf, zero); zbuf = _mm_add_epi64(zbuf, mbuf); px += nstep; py += nstep; } } for (i = 0; i < n3; i++) { diff = (mlib_s32)(*px++) - (*py++); sum += ABS_VALUE(diff); } mlib_d64 dsum = sum; long long pz[2]; _mm_storeu_si128((__m128i *)pz, zbuf); dsum += pz[0]; dsum += pz[1]; *z = dsum; } return (MLIB_SUCCESS); }
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; unsigned partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const unsigned threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */ for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=4) { __m128i mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample)); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8)); mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4)); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); /* 0 0 0 r0 */ __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* 0 0 0 |r0| == 00 |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=2) { __m128i mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /* 0 0 r1 r0 */ __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = 
_mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum); } } } /* now merge partitions for lower orders */ { unsigned from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { unsigned i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }