int32_t dot_product(int16_t *x, int16_t *y, uint32_t N, //must be a multiple of 8 uint8_t output_shift) { uint32_t n; #if defined(__x86_64__) || defined(__i386__) __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im; __m64 mmtmp7; __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1); int32_t result; x128 = (__m128i*) x; y128 = (__m128i*) y; mmcumul_re = _mm_setzero_si128(); mmcumul_im = _mm_setzero_si128(); for (n=0; n<(N>>2); n++) { //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128); // print_shorts("x",&x128[0]); // print_shorts("y",&y128[0]); // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y) mmtmp1 = _mm_madd_epi16(x128[0],y128[0]); // print_ints("re",&mmtmp1); // mmtmp1 contains real part of 4 consecutive outputs (32-bit) // shift and accumulate results mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift); mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1); // print_ints("re",&mmcumul_re); // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x) mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1)); // print_shorts("y",&mmtmp2); mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1)); // print_shorts("y",&mmtmp2); mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i); // print_shorts("y",&mmtmp2); mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2); // print_ints("im",&mmtmp3); // mmtmp3 contains imag part of 4 consecutive outputs (32-bit) // shift and accumulate results mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift); mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3); // print_ints("im",&mmcumul_im); x128++; y128++; } // this gives Re Re Im Im mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im); // print_ints("cumul1",&mmcumul); // this gives Re Im Re Im mmcumul = _mm_hadd_epi32(mmcumul,mmcumul); // print_ints("cumul2",&mmcumul); //mmcumul = _mm_srai_epi32(mmcumul,output_shift); // extract the lower half mmtmp7 = _mm_movepi64_pi64(mmcumul); // print_ints("mmtmp7",&mmtmp7); // pack the result mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7); // print_shorts("mmtmp7",&mmtmp7); // convert back to integer result = _mm_cvtsi64_si32(mmtmp7); _mm_empty(); _m_empty(); return(result); #elif defined(__arm__) int16x4_t *x_128=(int16x4_t*)x; int16x4_t *y_128=(int16x4_t*)y; int32x4_t tmp_re,tmp_im; int32x4_t tmp_re1,tmp_im1; int32x4_t re_cumul,im_cumul; int32x2_t re_cumul2,im_cumul2; int32x4_t shift = vdupq_n_s32(-output_shift); int32x2x2_t result2; int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ; re_cumul = vdupq_n_s32(0); im_cumul = vdupq_n_s32(0); for (n=0; n<(N>>2); n++) { tmp_re = vmull_s16(*x_128++, *y_128++); //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] tmp_re1 = vmull_s16(*x_128++, *y_128++); //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)), vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1))); //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] tmp_im = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++); //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++); //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)), vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1))); //tmp_im = 
[-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift)); im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift)); } re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul)); im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul)); re_cumul2 = vpadd_s32(re_cumul2,re_cumul2); im_cumul2 = vpadd_s32(im_cumul2,im_cumul2); result2 = vzip_s32(re_cumul2,im_cumul2); return(vget_lane_s32(result2.val[0],0)); #endif }
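/*
 * [Editor's note] A minimal scalar sketch of the dot product above, useful for
 * cross-checking either SIMD path. It assumes N counts complex samples stored as
 * interleaved int16 (Re, Im) pairs, as the SSE loop consumes them, and mirrors the
 * per-product shift plus the final saturate-and-pack of the real/imaginary sums.
 */
#include <stdint.h>

static int32_t dot_product_ref(const int16_t *x, const int16_t *y,
                               uint32_t N, uint8_t output_shift)
{
    int32_t re = 0, im = 0;
    for (uint32_t n = 0; n < N; n++) {
        int32_t xr = x[2 * n], xi = x[2 * n + 1];
        int32_t yr = y[2 * n], yi = y[2 * n + 1];
        re += (xr * yr + xi * yi) >> output_shift;   /* Re(conj(x) * y) */
        im += (xr * yi - xi * yr) >> output_shift;   /* Im(conj(x) * y) */
    }
    /* saturate both sums to int16 and pack as [im | re], like the SSE epilogue */
    if (re >  32767) re =  32767; else if (re < -32768) re = -32768;
    if (im >  32767) im =  32767; else if (im < -32768) im = -32768;
    return (int32_t)(((uint32_t)(uint16_t)im << 16) | (uint16_t)re);
}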
static void SinCos(const float rad, float &sin, float &cos) // #include <emmintrin.h>, #include <xmmintrin.h> { const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi); const __m128 _ps_0p5 = _mm_set1_ps(0.5f); const __m128 _ps_1 = _mm_set1_ps(1.0f); const __m128 _ps_dp1 = _mm_set1_ps(-0.7851562f); const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f); const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f); const __m128 _ps_sincof_p0 = _mm_set1_ps(2.443315711809948e-5f); const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f); const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f); const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f); const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f); const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f); const __m128i _pi32_1 = _mm_set1_epi32(1); const __m128i _pi32_i1 = _mm_set1_epi32(~1); const __m128i _pi32_2 = _mm_set1_epi32(2); const __m128i _pi32_4 = _mm_set1_epi32(4); const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000)); const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); __m128 mm1, mm2; __m128i mmi0, mmi2, mmi4; __m128 x, y, z; __m128 y1, y2; __m128 a = _mm_set1_ps(rad); x = _mm_and_ps(a, _mask_sign_inv); y = _mm_mul_ps(x, _ps_fopi); mmi2 = _mm_cvtps_epi32(y); mmi2 = _mm_add_epi32(mmi2, _pi32_1); mmi2 = _mm_and_si128(mmi2, _pi32_i1); y = _mm_cvtepi32_ps(mmi2); mmi4 = mmi2; mmi0 = _mm_and_si128(mmi2, _pi32_4); mmi0 = _mm_slli_epi32(mmi0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0); mmi2 = _mm_and_si128(mmi2, _pi32_2); mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(mmi2); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1)); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2)); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3)); mmi4 = _mm_sub_epi32(mmi4, _pi32_2); mmi4 = _mm_andnot_si128(mmi4, _pi32_4); mmi4 = _mm_slli_epi32(mmi4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(mmi4); __m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin); z = _mm_mul_ps(x, x); y1 = _mm_mul_ps(_ps_coscof_p0, z); y1 = _mm_add_ps(y1, _ps_coscof_p1); y1 = _mm_mul_ps(y1, z); y1 = _mm_add_ps(y1, _ps_coscof_p2); y1 = _mm_mul_ps(y1, z); y1 = _mm_mul_ps(y1, z); y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5)); y1 = _mm_add_ps(y1, _ps_1); y2 = _mm_mul_ps(_ps_sincof_p0, z); y2 = _mm_add_ps(y2, _ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, _ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); __m128 sin1y = _mm_andnot_ps(poly_mask, y1); __m128 sin2y = _mm_and_ps(poly_mask, y2); mm1 = _mm_add_ps(sin1y, sin2y); mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y)); sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin)); cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos)); }
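/*
 * [Editor's note] A small check harness, not part of the original source; it assumes
 * the SinCos() above and its `pi` constant are visible in this translation unit and
 * simply compares the results against libm over a sweep of inputs.
 */
#include <cmath>
#include <cstdio>

int main() {
    for (float rad = -6.0f; rad <= 6.0f; rad += 0.37f) {
        float s, c;
        SinCos(rad, s, c);
        std::printf("rad=% .3f  sin err=% .2e  cos err=% .2e\n",
                    rad, s - std::sin(rad), c - std::cos(rad));
    }
    return 0;
}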
static int cornerScore(const uchar* ptr, const int pixel[], int threshold) { const int K = 8, N = 16 + K + 1; int k, v = ptr[0]; short d[N]; for( k = 0; k < N; k++ ) d[k] = (short)(v - ptr[pixel[k]]); #if CV_SSE2 __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); for( k = 0; k < 16; k += 8 ) { __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); __m128i a = _mm_min_epi16(v0, v1); __m128i b = _mm_max_epi16(v0, v1); v0 = _mm_loadu_si128((__m128i*)(d+k+3)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+4)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+5)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+6)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+7)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+8)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k)); q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); v0 = _mm_loadu_si128((__m128i*)(d+k+9)); q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); } q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); threshold = (short)_mm_cvtsi128_si32(q0) - 1; #else int a0 = threshold; for( k = 0; k < 16; k += 2 ) { int a = std::min((int)d[k+1], (int)d[k+2]); a = std::min(a, (int)d[k+3]); if( a <= a0 ) continue; a = std::min(a, (int)d[k+4]); a = std::min(a, (int)d[k+5]); a = std::min(a, (int)d[k+6]); a = std::min(a, (int)d[k+7]); a = std::min(a, (int)d[k+8]); a0 = std::max(a0, std::min(a, (int)d[k])); a0 = std::max(a0, std::min(a, (int)d[k+9])); } int b0 = -a0; for( k = 0; k < 16; k += 2 ) { int b = std::max((int)d[k+1], (int)d[k+2]); b = std::max(b, (int)d[k+3]); b = std::max(b, (int)d[k+4]); b = std::max(b, (int)d[k+5]); if( b >= b0 ) continue; b = std::max(b, (int)d[k+6]); b = std::max(b, (int)d[k+7]); b = std::max(b, (int)d[k+8]); b0 = std::min(b0, std::max(b, (int)d[k])); b0 = std::min(b0, std::max(b, (int)d[k+9])); } threshold = -b0-1; #endif #if 0 // check that with the computed "threshold" the pixel is still a corner // and that with the increased-by-1 "threshold" the pixel is not a corner anymore for( int delta = 0; delta <= 1; delta++ ) { int v0 = std::min(ptr[0] + threshold + delta, 255); int v1 = std::max(ptr[0] - threshold - delta, 0); int c0 = 0, c1 = 0; for( int k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > v0) { if( ++c0 > K ) break; c1 = 0; } else if( x < v1 ) { if( ++c1 > K ) break; c0 = 0; } else { c0 = c1 = 0; } } CV_Assert( (delta == 0 && std::max(c0, c1) > K) || (delta == 1 && std::max(c0, c1) <= K) ); } #endif return threshold; }
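/*
 * [Editor's note] The end of the SSE branch reduces eight signed 16-bit lanes to their
 * maximum before subtracting 1. The same horizontal-max idiom in isolation, for
 * reference (a sketch, not OpenCV code):
 */
#include <emmintrin.h>
#include <stdint.h>

static int16_t hmax_epi16(__m128i v)
{
    v = _mm_max_epi16(v, _mm_unpackhi_epi64(v, v)); /* lanes 4..7 vs 0..3 */
    v = _mm_max_epi16(v, _mm_srli_si128(v, 4));     /* lanes 2..3 vs 0..1 */
    v = _mm_max_epi16(v, _mm_srli_si128(v, 2));     /* lane 1 vs 0        */
    return (int16_t)_mm_cvtsi128_si32(v);           /* overall max in lane 0 */
}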
/* Compute reflection coefficients from input signal */ void silk_burg_modified_sse4_1( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ const opus_int16 x[], /* I Input signal, length: nb_subfr * (D + subfr_length) */ const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I Number of subframes stacked in x */ const opus_int D, /* I Order */ int arch /* I Run-time architecture */ ) { opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; const opus_int16 *x_ptr; opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; __m128i CONST1 = _mm_set1_epi32(1); silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE); /* Compute autocorrelations, added over subframes */ silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length); if(rshifts > MAX_RSHIFTS) { C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS); silk_assert(C0 > 0); rshifts = MAX_RSHIFTS; } else { lz = silk_CLZ32(C0) - 1; rshifts_extra = N_BITS_HEAD_ROOM - lz; if(rshifts_extra > 0) { rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts); C0 = silk_RSHIFT32(C0, rshifts_extra); } else { rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts); C0 = silk_LSHIFT32(C0, -rshifts_extra); } rshifts += rshifts_extra; } CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts); } } } else { for(s = 0; s < nb_subfr; s++) { int i; opus_int32 d; x_ptr = x + s * subfr_length; celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch); for(n = 1; n < D + 1; n++) { for (i = n + subfr_length - D, d = 0; i < subfr_length; i++) d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]); xcorr[ n - 1 ] += d; } for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts); } } } silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); /* Initialize */ CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ invGain_Q30 = (opus_int32)1 << 30; reached_max_gain = 0; for(n = 0; n < D; n++) { /* Update first row of correlation matrix (without first element) */ /* Update last row of correlation matrix (without last element, stored in reversed order) */ /* Update C * Af */ /* Update C * flipud(Af) (stored in reversed order) */ if(rshifts > -2) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], 16 - rshifts); /* Q(16-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts); /* Q(16-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], QA - 16); /* Q(QA-16) */ tmp2 = 
silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16); /* Q(QA-16) */ for(k = 0; k < n; k++) { C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_SMLAWB(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp_QA = Af_QA[ k ]; tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]); /* Q(QA-16) */ } tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts); /* Q(16-rshifts) */ tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts); /* Q(16-rshifts) */ for(k = 0; k <= n; k++) { CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]); /* Q(-rshift) */ } } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], -rshifts); /* Q(-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts); /* Q(-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], 17); /* Q17 */ tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17); /* Q17 */ X1_3210 = _mm_set1_epi32(x1); X2_3210 = _mm_set1_epi32(x2); TMP1_3210 = _mm_setzero_si128(); TMP2_3210 = _mm_setzero_si128(); for(k = 0; k < n - 3; k += 4) { PTR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]); SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]); FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]); PTR_3210 = _mm_shuffle_epi32(PTR_3210, _MM_SHUFFLE(0, 1, 2, 3)); LAST_3210 = _mm_loadu_si128((__m128i *)&C_last_row[ k ]); ATMP_3210 = _mm_loadu_si128((__m128i *)&Af_QA[ k ]); T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210); T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7); ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1); FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210); LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210); PTR_3210 = _mm_mullo_epi32(ATMP_3210, PTR_3210); SUBFR_3210 = _mm_mullo_epi32(ATMP_3210, SUBFR_3210); _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210); _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210); TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210); TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210); } TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210)); TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E)); tmp1 += _mm_cvtsi128_si32(TMP1_3210); tmp2 += _mm_cvtsi128_si32(TMP2_3210); for(; k < n; k++) { C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_MLA(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17); /* Q17 */ tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ], Atmp1); /* Q17 */ tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1); /* Q17 */ } tmp1 = -tmp1; /* Q17 */ tmp2 = -tmp2; /* Q17 */ { __m128i xmm_tmp1, xmm_tmp2; __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; xmm_tmp1 = _mm_set1_epi32(tmp1); xmm_tmp2 = _mm_set1_epi32(tmp2); for(k = 0; k <= n - 3; k += 4) { xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]); xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]); 
xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3)); xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1); xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1); /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1); xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1); xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2); xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2); xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16); xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16); xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16); xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16); xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC); xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC); X1_3210 = _mm_loadu_si128((__m128i *)&CAf[ k ]); PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]); X1_3210 = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0); PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0); _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210); _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210); } for(; k <= n; k++) { CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1, silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1)); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2, silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */ } } } } /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ tmp1 = C_first_row[ n ]; /* Q(-rshifts) */ tmp2 = C_last_row[ n ]; /* Q(-rshifts) */ num = 0; /* Q(-rshifts) */ nrg = silk_ADD32(CAb[ 0 ], CAf[ 0 ]); /* Q(1-rshifts) */ for(k = 0; k < n; k++) { Atmp_QA = Af_QA[ k ]; lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1; lz = silk_min(32 - QA, lz); Atmp1 = silk_LSHIFT32(Atmp_QA, lz); /* Q(QA + lz) */ tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ num = silk_ADD_LSHIFT32(num, silk_SMMUL(CAb[ n - k ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ nrg = silk_ADD_LSHIFT32(nrg, silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]), Atmp1), 32 - QA - lz); /* Q(1-rshifts) */ } CAf[ n + 1 ] = tmp1; /* Q(-rshifts) */ CAb[ n + 1 ] = tmp2; /* Q(-rshifts) */ num = silk_ADD32(num, tmp2); /* Q(-rshifts) */ num = silk_LSHIFT32(-num, 1); /* Q(1-rshifts) */ /* Calculate the next order reflection (parcor) coefficient */ if(silk_abs(num) < nrg) { rc_Q31 = silk_DIV32_varQ(num, nrg, 31); } else { rc_Q31 = (num > 0) ? 
silk_int32_MAX : silk_int32_MIN; } /* Update inverse prediction gain */ tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31); tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2); if(tmp1 <= minInvGain_Q30) { /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */ tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30); /* Q30 */ rc_Q31 = silk_SQRT_APPROX(tmp2); /* Q15 */ /* Newton-Raphson iteration */ rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1); /* Q15 */ rc_Q31 = silk_LSHIFT32(rc_Q31, 16); /* Q31 */ if(num < 0) { /* Ensure adjusted reflection coefficients has the original sign */ rc_Q31 = -rc_Q31; } invGain_Q30 = minInvGain_Q30; reached_max_gain = 1; } else { invGain_Q30 = tmp1; } /* Update the AR coefficients */ for(k = 0; k < (n + 1) >> 1; k++) { tmp1 = Af_QA[ k ]; /* QA */ tmp2 = Af_QA[ n - k - 1 ]; /* QA */ Af_QA[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* QA */ Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* QA */ } Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA); /* QA */ if(reached_max_gain) { /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ for(k = n + 1; k < D; k++) { Af_QA[ k ] = 0; } break; } /* Update C * Af and C * Ab */ for(k = 0; k <= n + 1; k++) { tmp1 = CAf[ k ]; /* Q(-rshifts) */ tmp2 = CAb[ n - k + 1 ]; /* Q(-rshifts) */ CAf[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* Q(-rshifts) */ CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* Q(-rshifts) */ } } if(reached_max_gain) { for(k = 0; k < D; k++) { /* Scale coefficients */ A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); } /* Subtract energy of preceding samples from C0 */ if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts); } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts); } } /* Approximate residual energy */ *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2); *res_nrg_Q = -rshifts; } else { /* Return residual energy */ nrg = CAf[ 0 ]; /* Q(-rshifts) */ tmp1 = (opus_int32)1 << 16; /* Q16 */ for(k = 0; k < D; k++) { Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); /* Q16 */ nrg = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1); /* Q(-rshifts) */ tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1); /* Q16 */ A_Q16[ k ] = -Atmp1; } *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */ *res_nrg_Q = -rshifts; } }
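/*
 * [Editor's note] The helper OP_CVTEPI16_EPI32_M64() used in the vector loops above is
 * defined elsewhere in the Opus sources. A plausible definition, consistent with how it
 * is used here (load four int16 and sign-extend them to int32), is this sketch:
 */
#include <smmintrin.h>  /* SSE4.1: _mm_cvtepi16_epi32 */

#ifndef OP_CVTEPI16_EPI32_M64
#define OP_CVTEPI16_EPI32_M64(x) \
    _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(x)))
#endif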
#include <float.h>
#include <stdint.h>
#include <smmintrin.h> /* SSE4.1: _mm_blendv_ps, _mm_extract_epi32 */

void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
    // We suppose that pointers are aligned on a 16-byte boundary.
    // Initialise SSE registers. The running maximum must start at -FLT_MAX:
    // FLT_MIN is the smallest positive normal and would miss all-negative inputs.
    __m128i sse_idx_min = _mm_setzero_si128();
    __m128i sse_idx_max = _mm_setzero_si128();
    __m128 sse_min = _mm_set1_ps(FLT_MAX);
    __m128 sse_max = _mm_set1_ps(-FLT_MAX);
    // We unroll the for-loop by four, thus doing (n/4) vector iterations.
    const uint32_t n_sse = n & ~3U;
    __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
    const __m128i sse_4 = _mm_set1_epi32(4);
    for (uint32_t i = 0; i < n_sse; i += 4) {
        const __m128 sse_v = _mm_load_ps(&buf[i]);
        const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
        const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);
        sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
        sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);
        sse_idx_min = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_min), _mm_castsi128_ps(sse_idx), sse_cmp_min));
        sse_idx_max = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_max), _mm_castsi128_ps(sse_idx), sse_cmp_max));
        sse_idx = _mm_add_epi32(sse_idx, sse_4);
    }
    // SSE reduction: fold lanes 2,3 onto lanes 0,1, then lane 1 onto lane 0.
    __m128 sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 2 | (3 << 2)));
    __m128 sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 2 | (3 << 2)));
    __m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 2 | (3 << 2));
    __m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 2 | (3 << 2));
    __m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
    __m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
    sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
    sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
    sse_idx_min = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_min), _mm_castsi128_ps(sse_idx_min_permute), sse_cmp_min));
    sse_idx_max = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_max), _mm_castsi128_ps(sse_idx_max_permute), sse_cmp_max));
    sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 1));
    sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 1));
    sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 1);
    sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 1);
    sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
    sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
    sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
    sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
    sse_idx_min = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_min), _mm_castsi128_ps(sse_idx_min_permute), sse_cmp_min));
    sse_idx_max = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(sse_idx_max), _mm_castsi128_ps(sse_idx_max_permute), sse_cmp_max));
    // Epilogue: handle the trailing n % 4 elements in scalar code.
    float min, max;
    uint32_t idx_min, idx_max;
    _mm_store_ss(&min, sse_min);
    _mm_store_ss(&max, sse_max);
    idx_min = _mm_extract_epi32(sse_idx_min, 0);
    idx_max = _mm_extract_epi32(sse_idx_max, 0);
    for (uint32_t i = n_sse; i < n; i++) {
        const float v = buf[i];
        if (v < min) { min = v; idx_min = i; }
        if (v > max) { max = v; idx_max = i; }
    }
    *idx_min_ = idx_min;
    *min_ = min;
    *idx_max_ = idx_max;
    *max_ = max;
}
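/*
 * [Editor's note] A scalar reference for minmax_vec2(), handy for validating the vector
 * path (tie-breaking between equal elements may differ from the SIMD version).
 */
#include <float.h>
#include <stdint.h>

static void minmax_ref(uint32_t n, const float *buf,
                       uint32_t *idx_min, uint32_t *idx_max,
                       float *min_, float *max_)
{
    float mn = FLT_MAX, mx = -FLT_MAX;
    uint32_t imin = 0, imax = 0;
    for (uint32_t i = 0; i < n; i++) {
        if (buf[i] < mn) { mn = buf[i]; imin = i; }
        if (buf[i] > mx) { mx = buf[i]; imax = i; }
    }
    *idx_min = imin; *idx_max = imax;
    *min_ = mn; *max_ = mx;
}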
/* Function: p7_SSVFilter_longtarget() * Synopsis: Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision) * * Purpose: Calculates an approximation of the SSV (single ungapped diagonal) * score for regions of sequence <dsq> of length <L> residues, using * optimized profile <om>, and a preallocated one-row DP matrix <ox>, * and captures the positions at which such regions exceed the score * required to be significant in the eyes of the calling function, * which depends on the <bg> and <p> (usually p=0.02 for nhmmer). * Note that this variant performs only SSV computations, never * passing through the J state - the score required to pass SSV at * the default threshold (or less restrictive) is sufficient to * pass MSV in essentially all DNA models we've tested. * * Above-threshold diagonals are captured into a preallocated list * <windowlist>. Rather than simply capturing positions at which a * score threshold is reached, this function establishes windows * around those high-scoring positions, using scores in <msvdata>. * These windows can be merged by the calling function. * * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * msvdata - compact representation of substitution scores, for backtracking diagonals * bg - the background model, required for translating a P-value threshold into a score threshold * P - p-value below which a region is captured as being above threshold * windowlist - preallocated container for all hits (resized if necessary) * * * Note: We misuse the matrix <ox> here, using only a third of the * first dp row, accessing it as <dp[0..Q-1]> rather than * in triplets via <{MDI}MX(q)> macros, since we only need * to store M state values. We know that if <ox> was big * enough for normal DP calculations, it must be big enough * to hold the MSVFilter calculation. * * Returns: <eslOK> on success. * * Throws: <eslEINVAL> if <ox> allocation is too small. 
*/ int p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox, const P7_SCOREDATA *msvdata, P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist) { register __m128i mpv; /* previous row values */ register __m128i xEv; /* E state: keeps max for Mk->E for a single iteration */ register __m128i xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ register __m128i sv; /* temp storage of 1 curr row value in progress */ register __m128i biasv; /* emission bias in a vector */ uint8_t xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q = p7O_NQB(om->M); /* segment length: # of vectors */ __m128i *dp = ox->dpb[0]; /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/ __m128i *rsc; /* will point at om->rbv[x] for residue x[i] */ __m128i tecv; /* vector for E->C cost */ __m128i tjbmv; /* vector for J->B move cost + B->M move costs */ __m128i basev; /* offset for scores */ __m128i ceilingv; /* saturated simd value used to test for overflow */ __m128i tempv; /* work vector */ int cmp; int k; int n; int end; int rem_sc; int start; int target_end; int target_start; int max_end; int max_sc; int sc; int pos_since_max; float ret_sc; union { __m128i v; uint8_t b[16]; } u; /* * Computing the score required to let P meet the F1 prob threshold * In original code, converting from a scaled int MSV * score S (the score getting to state E) to a probability goes like this: * usc = S - om->tec_b - om->tjb_b - om->base_b; * usc /= om->scale_b; * usc -= 3.0; * P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda) * and we're computing the threshold usc, so reverse it: * (usc - nullsc) / eslCONST_LOG2 = inv_f( P, mu, lambda) * usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda) * usc += 3 * usc *= om->scale_b * S = usc + om->tec_b + om->tjb_b + om->base_b * * Here, I compute threshold with length model based on max_length. Doesn't * matter much - in any case, both the bg and om models will change with roughly * 1 bit for each doubling of the length model, so they offset. */ float nullsc; __m128i sc_threshv; uint8_t sc_thresh; float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); ox->M = om->M; p7_bg_SetLength(bg, om->max_length); p7_oprofile_ReconfigMSVLength(om, om->max_length); p7_bg_NullOne (bg, dsq, om->max_length, &nullsc); sc_thresh = (int) ceil( ( ( nullsc + (invP * eslCONST_LOG2) + 3.0 ) * om->scale_b ) + om->base_b + om->tec_b + om->tjb_b ); sc_threshv = _mm_set1_epi8((int8_t) 255 - sc_thresh); /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. */ biasv = _mm_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */ ceilingv = _mm_cmpeq_epi8(biasv, biasv); for (q = 0; q < Q; q++) dp[q] = _mm_setzero_si128(); xJ = 0; basev = _mm_set1_epi8((int8_t) om->base_b); tecv = _mm_set1_epi8((int8_t) om->tec_b); tjbmv = _mm_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b); xBv = _mm_subs_epu8(basev, tjbmv); for (i = 1; i <= L; i++) { rsc = om->rbv[dsq[i]]; xEv = _mm_setzero_si128(); /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically, which is our -infinity. */ mpv = _mm_slli_si128(dp[Q-1], 1); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. 
*/ sv = _mm_max_epu8(mpv, xBv); sv = _mm_adds_epu8(sv, biasv); sv = _mm_subs_epu8(sv, *rsc); rsc++; xEv = _mm_max_epu8(xEv, sv); mpv = dp[q]; /* Load {MDI}(i-1,q) into mpv */ dp[q] = sv; /* Do delayed store of M(i,q) now that memory is usable */ } /* test if the pthresh significance threshold has been reached; * note: don't use _mm_cmpgt_epi8, because it's a signed comparison, which won't work on uint8s */ tempv = _mm_adds_epu8(xEv, sc_threshv); tempv = _mm_cmpeq_epi8(tempv, ceilingv); cmp = _mm_movemask_epi8(tempv); if (cmp != 0) { //hit pthresh, so add position to list and reset values //figure out which model state hit threshold end = -1; rem_sc = -1; for (q = 0; q < Q; q++) { /// Unpack and unstripe, so we can find the state that exceeded pthresh u.v = dp[q]; for (k = 0; k < 16; k++) { // unstripe //(q+Q*k+1) is the model position k at which the xE score is found if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) { end = (q+Q*k+1); rem_sc = u.b[k]; } } dp[q] = _mm_set1_epi8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration } //recover the diagonal that hit threshold start = end; target_end = target_start = i; sc = rem_sc; while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) { rem_sc -= om->bias_b - msvdata->msv_scores[start*om->abc->Kp + dsq[target_start]]; --start; --target_start; } start++; target_start++; //extend diagonal further with single diagonal extension k = end+1; n = target_end+1; max_end = target_end; max_sc = sc; pos_since_max = 0; while (k<om->M && n<=L) { sc += om->bias_b - msvdata->msv_scores[k*om->abc->Kp + dsq[n]]; if (sc >= max_sc) { max_sc = sc; max_end = n; pos_since_max=0; } else { pos_since_max++; if (pos_since_max == 5) break; } k++; n++; } end += (max_end - target_end); k += (max_end - target_end); target_end = max_end; ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b); ret_sc /= om->scale_b; ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ p7_hmmwindow_new(windowlist, 0, target_start, k, end, end-start+1 , ret_sc, p7_NOCOMPLEMENT ); i = target_end; // skip forward } } /* end loop over sequence residues 1..L */ return eslOK; }
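/*
 * [Editor's note] The threshold test above avoids the signed _mm_cmpgt_epi8 by adding
 * (255 - sc_thresh) with unsigned saturation and comparing against all-ones. The same
 * trick in isolation (an illustrative sketch, not HMMER code): any_lane_ge() returns
 * nonzero iff some unsigned byte lane of x is >= t.
 */
#include <emmintrin.h>
#include <stdint.h>

static int any_lane_ge(__m128i x, uint8_t t)
{
    const __m128i ones    = _mm_set1_epi8((char)0xFF);   /* every lane = 255        */
    const __m128i threshv = _mm_set1_epi8((char)(255 - t));
    __m128i sat = _mm_adds_epu8(x, threshv);             /* saturates to 255 iff x >= t */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(sat, ones)) != 0;
}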
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments ) { typedef uchar T; typedef int WT; typedef int MT; Size size = img.size(); int y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); for( y = 0; y < size.height; y++ ) { const T* ptr = img.ptr<T>(y); int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0; if( useSIMD ) { __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i dx = _mm_set1_epi16(8); __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init; for( ; x <= size.width - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); qx = _mm_add_epi16(qx, dx); } int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); x1 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx2); x2 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx3); x3 = buf[0] + buf[1] + buf[2] + buf[3]; } for( ; x < size.width; x++ ) { WT p = ptr[x]; WT xp = x * p, xxp; x0 += p; x1 += xp; xxp = xp * x; x2 += xxp; x3 += xxp * x; } WT py = y * x0, sy = y*y; mom[9] += ((MT)py) * sy; // m03 mom[8] += ((MT)x1) * sy; // m12 mom[7] += ((MT)x2) * y; // m21 mom[6] += x3; // m30 mom[5] += x0 * sy; // m02 mom[4] += x1 * y; // m11 mom[3] += x2; // m20 mom[2] += py; // m01 mom[1] += x1; // m10 mom[0] += x0; // m00 } for(int x = 0; x < 10; x++ ) moments[x] = (double)mom[x]; }
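/*
 * [Editor's note] Scalar reference for the ten raw moments the tile loop accumulates,
 * m_pq = sum over (x, y) of x^p * y^q * I(x, y), in the same mom[] ordering. A sketch
 * for verification only; it accumulates in double rather than the int/int specialization.
 */
static void momentsInTileRef(const unsigned char *img, int w, int h, int step,
                             double mom[10])
{
    for (int i = 0; i < 10; i++) mom[i] = 0.0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            double p = img[y * step + x];
            mom[0] += p;              /* m00 */
            mom[1] += x * p;          /* m10 */
            mom[2] += y * p;          /* m01 */
            mom[3] += x * x * p;      /* m20 */
            mom[4] += x * y * p;      /* m11 */
            mom[5] += y * y * p;      /* m02 */
            mom[6] += x * x * x * p;  /* m30 */
            mom[7] += x * x * y * p;  /* m21 */
            mom[8] += x * y * y * p;  /* m12 */
            mom[9] += y * y * y * p;  /* m03 */
        }
}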
/*---------------------------------------------------------------------------*/ pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( const INT16 *pSrc[3], int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi) /* region of interest */ { __m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096; __m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf; int srcbump, dstbump, yp, imax; if (((ULONG_PTR) (pSrc[0]) & 0x0f) || ((ULONG_PTR) (pSrc[1]) & 0x0f) || ((ULONG_PTR) (pSrc[2]) & 0x0f) || ((ULONG_PTR) (pDst[0]) & 0x0f) || ((ULONG_PTR) (pDst[1]) & 0x0f) || ((ULONG_PTR) (pDst[2]) & 0x0f) || (roi->width & 0x07) || (srcStep & 127) || (dstStep & 127)) { /* We can't maintain 16-byte alignment. */ return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi); } zero = _mm_setzero_si128(); max = _mm_set1_epi16(255); y_buf = (__m128i*) (pSrc[0]); cb_buf = (__m128i*) (pSrc[1]); cr_buf = (__m128i*) (pSrc[2]); r_buf = (__m128i*) (pDst[0]); g_buf = (__m128i*) (pDst[1]); b_buf = (__m128i*) (pDst[2]); r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ c4096 = _mm_set1_epi16(4096); srcbump = srcStep / sizeof(__m128i); dstbump = dstStep / sizeof(__m128i); #ifdef DO_PREFETCH /* Prefetch Y's, Cb's, and Cr's. */ for (yp=0; yp<roi->height; yp++) { int i; for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; } y_buf = (__m128i*) (pSrc[0]); cb_buf = (__m128i*) (pSrc[1]); cr_buf = (__m128i*) (pSrc[2]); #endif /* DO_PREFETCH */ imax = roi->width * sizeof(INT16) / sizeof(__m128i); for (yp=0; yp<roi->height; ++yp) { int i; for (i=0; i<imax; i++) { /* In order to use SSE2 signed 16-bit integer multiplication * we need to convert the floating point factors to signed int * without losing information. * The result of this multiplication is 32 bit and we have two * SSE instructions that return either the hi or lo word. * Thus we will multiply the factors by the highest possible 2^n, * take the upper 16 bits of the signed 32-bit result * (_mm_mulhi_epi16) and correct this result by multiplying * it by 2^(16-n). * * For the given factors in the conversion matrix the best * possible n is 14. 
* * Example for calculating r: * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_r_buf[i] + 4096) >> 2 */ __m128i y, cb, cr, r, g, b; y = _mm_load_si128(y_buf + i); y = _mm_add_epi16(y, c4096); y = _mm_srai_epi16(y, 2); /* cb = cb_g_buf[i]; */ cb = _mm_load_si128(cb_buf + i); /* cr = cr_b_buf[i]; */ cr = _mm_load_si128(cr_buf + i); /* (y + HIWORD(cr*22986)) >> 3 */ r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); r = _mm_srai_epi16(r, 3); /* r_buf[i] = MINMAX(r, 0, 255); */ _mm_between_epi16(r, zero, max); _mm_store_si128(r_buf + i, r); /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); g = _mm_srai_epi16(g, 3); /* g_buf[i] = MINMAX(g, 0, 255); */ _mm_between_epi16(g, zero, max); _mm_store_si128(g_buf + i, g); /* (y + HIWORD(cb*28999)) >> 3 */ b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); b = _mm_srai_epi16(b, 3); /* b_buf[i] = MINMAX(b, 0, 255); */ _mm_between_epi16(b, zero, max); _mm_store_si128(b_buf + i, b); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; r_buf += dstbump; g_buf += dstbump; b_buf += dstbump; } return PRIMITIVES_SUCCESS; }
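/*
 * [Editor's note] A scalar sketch of the per-sample fixed-point arithmetic described in
 * the comment above (n = 14, factors pre-scaled by 2^14, high word taken as in
 * _mm_mulhi_epi16). Illustrative only; it is not the FreeRDP API.
 */
#include <stdint.h>

static int16_t clamp255(int32_t v) { return (int16_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void ycbcr_to_rgb_sample(int16_t yv, int16_t cb, int16_t cr,
                                int16_t *r, int16_t *g, int16_t *b)
{
    int32_t y = (yv + 4096) >> 2;                              /* (y + 4096) >> 2 */
    *r = clamp255((y + ((cr *  22986) >> 16)) >> 3);           /*  1.403 * 2^14   */
    *g = clamp255((y + ((cb *  -5636) >> 16)
                     + ((cr * -11698) >> 16)) >> 3);           /* -0.344 / -0.714 */
    *b = clamp255((y + ((cb *  28999) >> 16)) >> 3);           /*  1.770 * 2^14   */
}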
void Polyval_Htable(unsigned char* Htbl, unsigned char* inp, int length, unsigned char* POLYVAL) { int remainder =0; int rem_128 = (length%128) - length%16; int has_semi = length %16; unsigned char* fixed_inp = inp; int i; uint8_t B[16] ={0}; __m128i data, TMP0, TMP1, TMP2, TMP3, TMP4, T, Xhi, POLY; if (length==0) return; Xhi = _mm_setzero_si128(); POLY = _mm_setr_epi32(0x1,0,0,0xc2000000); T = _mm_loadu_si128(((__m128i*)POLYVAL)); if ((length!=0) || (rem_128!=0)){ if (rem_128!=0) { fixed_inp +=rem_128; remainder = rem_128/16; data = _mm_loadu_si128(((__m128i*)inp)); data = _mm_xor_si128(T, data); TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x01); TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x00); TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x11); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x10); TMP2 = _mm_xor_si128(TMP2, TMP3); for (i=1; i<(rem_128/16); i++) { data = _mm_loadu_si128(&((__m128i*)inp)[i]); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x00); TMP0 = _mm_xor_si128(TMP0, TMP3); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x11); TMP1 = _mm_xor_si128(TMP1, TMP3); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x01); TMP2 = _mm_xor_si128(TMP2, TMP3); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x10); TMP2 = _mm_xor_si128(TMP2, TMP3); } TMP3 = _mm_srli_si128(TMP2, 8); TMP2 = _mm_slli_si128(TMP2, 8); Xhi = _mm_xor_si128(TMP3, TMP1); T = _mm_xor_si128(TMP0, TMP2); length -= rem_128; } length /=16; i=0; if (length!=0) { if (rem_128==0) { data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]); TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01); TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00); TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10); TMP2 = _mm_xor_si128(TMP2, TMP3); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[1]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[2]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[3]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]); TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[4]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[5]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[6]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]); data = _mm_xor_si128(T, data); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[7]); TMP3 = _mm_srli_si128(TMP2, 8); TMP2 = _mm_slli_si128(TMP2, 8); Xhi = _mm_xor_si128(TMP3, TMP1); T = _mm_xor_si128(TMP0, TMP2); i=8; } for (; i<length; i=i+8) { data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]); TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01); TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00); TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10); TMP2 = _mm_xor_si128(TMP2, TMP3); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[1]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]); TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[2]); T = _mm_xor_si128(T, TMP4); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]); 
SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[3]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]); TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[4]); T = _mm_xor_si128(T, TMP4); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[5]); T = _mm_xor_si128(T, Xhi); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[6]); data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]); data = _mm_xor_si128(T, data); SCHOOLBOOK_AAD(data,((__m128i*)Htbl)[7]); TMP3 = _mm_srli_si128(TMP2, 8); TMP2 = _mm_slli_si128(TMP2, 8); Xhi = _mm_xor_si128(TMP3, TMP1); T = _mm_xor_si128(TMP0, TMP2); } TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); T = _mm_xor_si128(Xhi, T); } else { // length was <16 and there was several blocks on start - need to finialize reduction if (rem_128!=0) { TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); T = _mm_xor_si128(Xhi, T); } } } if (has_semi!=0) { memcpy(B, (uint8_t*)(&((__m128i*)fixed_inp)[i]),has_semi); data = _mm_loadu_si128((__m128i*)B); data = _mm_xor_si128(T,data); TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01); TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00); TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11); TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10); TMP2 = _mm_xor_si128(TMP2, TMP3); TMP3 = _mm_srli_si128(TMP2, 8); TMP2 = _mm_slli_si128(TMP2, 8); Xhi = _mm_xor_si128(TMP3, TMP1); T = _mm_xor_si128(TMP0, TMP2); TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10); T =_mm_alignr_epi8(T, T, 8); T = _mm_xor_si128(TMP3, T); T = _mm_xor_si128(Xhi, T); } _mm_storeu_si128(((__m128i*)POLYVAL), T); }
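/*
 * [Editor's note] SCHOOLBOOK_AAD() is defined elsewhere in the AES-GCM-SIV sources and
 * not shown here. A plausible reconstruction, consistent with the explicitly unrolled
 * first block above, accumulates the four 64x64 carry-less partial products of a data
 * block and an H-table entry into TMP0 (low), TMP1 (high) and TMP2 (middle):
 */
#include <wmmintrin.h>  /* PCLMULQDQ */

#define SCHOOLBOOK_AAD(a, b)                      \
    do {                                          \
        TMP3 = _mm_clmulepi64_si128(a, b, 0x00);  \
        TMP0 = _mm_xor_si128(TMP0, TMP3);         \
        TMP3 = _mm_clmulepi64_si128(a, b, 0x11);  \
        TMP1 = _mm_xor_si128(TMP1, TMP3);         \
        TMP3 = _mm_clmulepi64_si128(a, b, 0x01);  \
        TMP2 = _mm_xor_si128(TMP2, TMP3);         \
        TMP3 = _mm_clmulepi64_si128(a, b, 0x10);  \
        TMP2 = _mm_xor_si128(TMP2, TMP3);         \
    } while (0)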
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t *p0 = (uint16_t *)buff + 8; uint16_t *p1 = p0 + bstride; uint16_t *p2 = p1 + bstride; uint16_t *p3 = p2 + bstride; uint16_t *p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i max = _mm_set1_epi32(0xFFFF); __m128 matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_set1_ps((float)ch->m[i]); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); uint16_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 8) { __m128 sum[2] = {(__m128)zero, (__m128)zero}; for (int i = 0; i < 25; i++) { __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero)); __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero)); xmm1 = _mm_mul_ps(xmm1, matrix[i]); xmm2 = _mm_mul_ps(xmm2, matrix[i]); sum[0] = _mm_add_ps(sum[0], xmm1); sum[1] = _mm_add_ps(sum[1], xmm2); } __m128i sumi[2]; for (int i = 0; i < 2; i++) { sum[i] = _mm_mul_ps(sum[i], rdiv); sum[i] = _mm_add_ps(sum[i], bias); if (!ch->saturate) { sum[i] = mm_abs_ps(sum[i]); } sumi[i] = _mm_cvtps_epi32(sum[i]); sumi[i] = mm_min_epi32(sumi[i], max); __m128i mask = _mm_cmpgt_epi32(sumi[i], zero); sumi[i] = _mm_and_si128(sumi[i], mask); } sumi[0] = mm_cast_epi32(sumi[0], sumi[1]); _mm_store_si128((__m128i *)(dstp + x), sumi[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
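/*
 * [Editor's note] mm_abs_ps(), mm_min_epi32() and mm_cast_epi32() are project helpers
 * that are not shown here. Plausible SSE2 implementations of the first two (the real
 * ones may differ) are sketched below; mm_cast_epi32() presumably packs two int32
 * vectors down to unsigned 16-bit and is left out.
 */
#include <emmintrin.h>

static inline __m128 mm_abs_ps(__m128 x)
{
    return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);   /* clear the sign bit */
}

static inline __m128i mm_min_epi32(__m128i a, __m128i b)
{
    __m128i mask = _mm_cmpgt_epi32(a, b);          /* lanes where a > b */
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}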
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *p3 = p2 + bstride; uint8_t *p4 = p3 + bstride; uint8_t *orig = p0, *end = p4; line_copy8(p0, srcp + 2 * stride , width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 16) { __m128i sum[4] = { zero, zero, zero, zero }; for (int i = 0; i < 25; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); xmm2 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i])); sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i])); xmm1 = _mm_unpackhi_epi16(xmm2, zero); xmm0 = _mm_unpacklo_epi16(xmm2, zero); sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i])); sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i])); } for (int i = 0; i < 4; i++) { __m128 sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); sumfp = _mm_add_ps(sumfp, bias); if (!ch->saturate) { sumfp = mm_abs_ps(sumfp); } sum[i] = _mm_cvttps_epi32(sumfp); } sum[0] = _mm_packs_epi32(sum[0], sum[1]); sum[1] = _mm_packs_epi32(sum[2], sum[3]); sum[0] = _mm_packus_epi16(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
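/*
 * [Editor's note] line_copy8() (and its 16-bit sibling used above) is a project helper
 * that is not shown. A plausible sketch: copy one row into the padded buffer and
 * replicate `border` pixels on each side so the 5x5 taps can read past the row ends
 * (the real helper may mirror rather than replicate).
 */
#include <stdint.h>
#include <string.h>

static void line_copy8_sketch(uint8_t *dst, const uint8_t *src, int width, int border)
{
    memcpy(dst, src, (size_t)width);
    for (int i = 1; i <= border; i++) {
        dst[-i] = src[0];                     /* left padding  */
        dst[width - 1 + i] = src[width - 1];  /* right padding */
    }
}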
static inline void i40e_rxq_rearm(struct i40e_rx_queue *rxq) { int i; uint16_t rx_id; volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start]; struct rte_mbuf *mb0, *mb1; __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM); __m128i dma_addr0, dma_addr1; rxdp = rxq->rx_ring + rxq->rxrearm_start; /* Pull 'n' more MBUFs into the software ring */ if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, RTE_I40E_RXQ_REARM_THRESH) < 0) { if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= rxq->nb_rx_desc) { dma_addr0 = _mm_setzero_si128(); for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { rxep[i].mbuf = &rxq->fake_mbuf; _mm_store_si128((__m128i *)&rxdp[i].read, dma_addr0); } } rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += RTE_I40E_RXQ_REARM_THRESH; return; } /* Initialize the mbufs in vector, process 2 mbufs in one loop */ for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) { __m128i vaddr0, vaddr1; mb0 = rxep[0].mbuf; mb1 = rxep[1].mbuf; /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); /* convert pa to dma_addr hdr/data */ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); /* add headroom to pa values */ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); /* flush desc with pa dma_addr */ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); } rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH; if (rxq->rxrearm_start >= rxq->nb_rx_desc) rxq->rxrearm_start = 0; rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH; rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); /* Update the tail pointer on the NIC */ I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); }
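/*
 * [Editor's note] The rearm loop relies on buf_physaddr lying directly after buf_addr in
 * struct rte_mbuf, so a single 128-bit load picks up both fields and
 * _mm_unpackhi_epi64() broadcasts the physical address into both descriptor halves.
 * The idiom in isolation (illustrative helper, not a DPDK API):
 */
#include <emmintrin.h>
#include <stdint.h>

static __m128i broadcast_high64(const uint64_t pair[2])   /* pair = {low, high} */
{
    __m128i v = _mm_loadu_si128((const __m128i *)pair);
    return _mm_unpackhi_epi64(v, v);                      /* both lanes hold pair[1] */
}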
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) { __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; #ifdef USE_SSE2 __m128i emm0, emm2, emm4; #else __m64 mm0, mm1, mm2, mm3, mm4, mm5; #endif sign_bit_sin = x; /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm2:mm3 */ xmm3 = _mm_movehl_ps(xmm3, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm3); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm4 = mm2; mm5 = mm3; /* get the swap sign flag for the sine */ mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); __m128 swap_sign_bit_sin; COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); /* get the polynom selection mask for the sine */ mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 poly_mask; COPY_MM_TO_XMM(mm2, mm3, poly_mask); #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); #ifdef USE_SSE2 emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); #else /* get the sign flag for the cosine */ mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2); mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2); mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4); mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4); mm4 = _mm_slli_pi32(mm4, 29); mm5 = _mm_slli_pi32(mm5, 29); __m128 sign_bit_cos; COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); _mm_empty(); /* good-bye mmx */ #endif sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m128 z = _mm_mul_ps(x,x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = 
_mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2,ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1,ysin2); xmm2 = _mm_add_ps(y,y2); /* update the sign */ *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
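/*
 * [Editor's note] A scalar sketch (not from the source) of the "magic pass" range
 * reduction used by sincos_ps()/cos_ps(): j = trunc(x * 4/pi) rounded up to even, then
 * x is reduced by j * pi/4 in three steps, pi/4 being split into DP1 + DP2 + DP3 so the
 * subtraction stays accurate in single precision. Constants are the usual Cephes values.
 */
static float cephes_range_reduce(float x, int *j_out)
{
    const float FOPI = 1.27323954473516f;          /* 4 / pi            */
    const float DP1  = 0.78515625f;                /* high part of pi/4 */
    const float DP2  = 2.4187564849853515625e-4f;  /* middle part       */
    const float DP3  = 3.77489497744594108e-8f;    /* low part          */
    int j = (int)(x * FOPI);
    j = (j + 1) & ~1;                              /* j = (j+1) & (~1), as above   */
    float y = (float)j;
    x = ((x - y * DP1) - y * DP2) - y * DP3;
    *j_out = j;                                    /* selects octant / sign flips  */
    return x;                                      /* now roughly in [-pi/4, pi/4] */
}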
/* almost the same as sin_ps */ __m128 cos_ps(v4sfu *xPtr) { // any x __m128 x=*((__m128 *)xPtr); __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; #ifdef USE_SSE2 __m128i emm0, emm2; #else __m64 mm0, mm1, mm2, mm3; #endif /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in mm0 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2); /* get the swap sign flag */ emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 sign_bit = _mm_castsi128_ps(emm0); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm0:mm1 */ xmm2 = _mm_movehl_ps(xmm2, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm2); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2); mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2); /* get the swap sign flag in mm0:mm1 and the polynom selection mask in mm2:mm3 */ mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 sign_bit, poly_mask; COPY_MM_TO_XMM(mm0, mm1, sign_bit); COPY_MM_TO_XMM(mm2, mm3, poly_mask); _mm_empty(); /* good-bye mmx */ #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m128*)_ps_coscof_p0; __m128 z = _mm_mul_ps(x,x); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm_and_ps(xmm3, y2); //, xmm3); y = _mm_andnot_ps(xmm3, y); y = _mm_add_ps(y,y2); /* update the sign */ y = _mm_xor_ps(y, sign_bit); return y; }
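/*
 * [Editor's note] A small harness, not part of the original source. It assumes v4sfu is
 * layout-compatible with four packed floats (e.g. `union v4sfu { float f[4]; };`), which
 * is how the functions above reinterpret it as __m128; the project's actual definition
 * may differ.
 */
#include <xmmintrin.h>
#include <cmath>
#include <cstdio>

int main() {
    v4sfu x, s, c;
    for (int i = 0; i < 4; i++) x.f[i] = 0.5f * (float)(i + 1);
    sincos_ps(&x, &s, &c);
    float cr[4];
    _mm_storeu_ps(cr, cos_ps(&x));
    for (int i = 0; i < 4; i++)
        std::printf("x=%.3f  sin=%.6f (libm %.6f)  cos=%.6f  cos_ps=%.6f\n",
                    x.f[i], s.f[i], std::sin(x.f[i]), c.f[i], cr[i]);
    return 0;
}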
mlib_status
__mlib_VideoP64Loop_S16_U8(
    mlib_s16 mc_block[64],
    const mlib_u8 *ref_block,
    mlib_s32 stride)
{
    const mlib_u8 *sl;
    mlib_s16 *sd;
    __m128i txmm0, txmm1, txmm2, txmm3, txmm4, txmm5, txmm6, txmm7;
    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    __m128i Czero, CF, C2, C4, C8;

    Czero = _mm_setzero_si128();
    C2 = _mm_set1_epi16(2);
    C4 = _mm_set1_epi16(4);
    C8 = _mm_set1_epi16(8);
    CF = _mm_set_epi32(0xff0000, 0, 0, 0xff);

    sd = mc_block;
    sl = ref_block;

    LOADL(0); sl += stride; FILTERX(0);
    LOADL(1); sl += stride; FILTERX(1);
    STORB(0); sd += 8;
    ADDL(0, 1);

    LOADL(2); sl += stride; FILTERX(2);
    ADDLRND(1, 2); STORSUM(0, 1); sd += 8;

    LOADL(3); sl += stride; FILTERX(3);
    ADDL(2, 3); STORSUM(1, 2); sd += 8;

    LOADL(4); sl += stride; FILTERX(4);
    ADDLRND(3, 4); STORSUM(2, 3); sd += 8;

    LOADL(5); sl += stride; FILTERX(5);
    ADDL(4, 5); STORSUM(3, 4); sd += 8;

    LOADL(6); sl += stride; FILTERX(6);
    ADDLRND(5, 6); STORSUM(4, 5); sd += 8;

    LOADL(7); FILTERX(7);
    ADDL(6, 7); STORSUM(5, 6); sd += 8;
    STORB(7);

    return (MLIB_SUCCESS);
}
static void aom_filter_block1d4_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32; __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, srcReg6, srcReg56; __m128i srcReg23_34_lo, srcReg45_56_lo; __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; __m128i resReglo, resReghi; __m128i firstFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); // merge every two consecutive registers srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); // shift by 6 bit each 16 bit resReglo = _mm_adds_epi16(resReglo, addFilterReg32); resReghi = _mm_adds_epi16(resReghi, addFilterReg32); resReglo = _mm_srai_epi16(resReglo, 6); resReghi = _mm_srai_epi16(resReghi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReglo = _mm_packus_epi16(resReglo, resReglo); resReghi = _mm_packus_epi16(resReghi, resReghi); src_ptr += src_stride; *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4 = srcReg6; } }
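/* Scalar sketch (not from the original file) of what the 4-tap vertical path
 * above computes for one output pixel: the packed filter taps have been
 * pre-shifted right by 1, taps 2..5 are applied to source rows 2..5 of a
 * column, and the sum is rounded with +32 and shifted right by 6 before
 * clamping to 8 bits. The helper name and parameter layout are illustrative
 * assumptions, not part of the library. */
#include <stddef.h>
#include <stdint.h>

static inline uint8_t vfilter4_tap_scalar(const uint8_t *col, ptrdiff_t pitch,
                                          const int16_t *filter /* 8 taps */)
{
  int sum = 0;
  for (int k = 2; k <= 5; k++)
    sum += (filter[k] >> 1) * col[k * pitch];   /* taps 2..5 on rows 2..5 */
  sum = (sum + 32) >> 6;                        /* round and shift */
  if (sum < 0) sum = 0;                         /* clamp to uint8_t range */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}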
int main(void)
{
    int i, j, k = 0, len, set, hop;

    setvbuf(stdout, NULL, _IOLBF, 0);
    plan_tests(1 + nffstests + nshltests + 5 + 128);

    XMM ones = (__v2di) { -1, -1 };
    ok(xm_same(xm_ones, ones), "xm_ones produces FFFF...FFFF");

    for (i = 0; i < nffstests; ++i) {
        int act = findbit_1((uint8_t*)&ffstestv[i].inp, sizeof(XMM));
        ok(act == ffstestv[i].exp, "test %d: act:%d exp:%d", i, act, ffstestv[i].exp);
    }

    for (i = 0; i < nshltests; ++i) {
        char acts[99];
        XMM act = xm_shl(shltestv[0].exp, shltestv[i].nbits);
        xm_llx(act, acts);
        ok(0xFFFF == xm_same(act, shltestv[i].exp), "shl %d: %s", shltestv[i].nbits, acts);
    }

    XMM bitz = { 1, 8 };
    int iact = xm_ffs(bitz);
    ok(iact == 0, "xm_ffs(bitz) = %d", iact);
    iact = xm_fls(bitz);
    ok(iact == 67, "xm_fls(bitz) = %d", iact);

    bitz = _mm_setzero_si128();
    iact = xm_ffs(bitz);
    ok(iact == -1, "xm_ffs(zero) = %d", iact);
    iact = xm_fls(bitz);
    ok(iact == -1, "xm_fls(zero) = %d", iact);

    XMM stuff = (__v2di) { 0x07BB01426C62272EULL, 0x6295C58D62B82175ULL };
    char str[48];
    xm_str(stuff, str);
    is_strncmp(str, "2E,27,62,6C,42,01,BB,07-75,21,B8,62,8D,C5,95,62", 48, "xm_str");

    XMM one = { 1, 0 }, hibit = xm_shl_177(one);
    for (i = 0; i < 128; ++i) {
        int pos = xm_ffs(xm_or(xm_shl(one, i), hibit));
        ok(pos == i, "xm_ffs(xm_shl(one,%d)) = %d", i, pos);
    }

# define CMPSIZE 100000000L
    char *x = malloc(CMPSIZE), *y = malloc(CMPSIZE);
    memcpy(x, y, CMPSIZE);

    for (len = 20; len <= CMPSIZE; len *= 4) {
        for (i = 1; i < 17; i += i) {
            for (j = 1; j < 17; j += j) {
                for (set = 1; set < 2; set++) {
                    char *a, *b;
                    double t0 = tick();
                    for (hop = CMPSIZE / len, a = x, b = y; hop; --hop, a += len, b += len)
                        a[i+len-17] ^= set, k += cmpxm(a+i, b+j, len-17), a[i+len-17] ^= set;
                    double t1 = tick();
                    for (hop = CMPSIZE / len, a = x, b = y; hop; --hop, a += len, b += len) {
                        _mm_prefetch(a+i+448, _MM_HINT_NTA);
                        _mm_prefetch(b+j+448, _MM_HINT_NTA);
                        a[i+len-17] ^= set, k += memcmp(a+i, b+j, len-17), a[i+len-17] ^= set;
                    }
                    double t2 = tick();
                    printf("%9d %2d %2d %d %4.1f\n", len, i, j, set, (t2 - t1)/(t1 - t0));
                }
            }
        }
    }

    if (!k) puts("");
    return exit_status();
}
static void aom_filter_block1d8_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23, srcReg34, srcReg45, srcReg56; __m128i resReg23, resReg34, resReg45, resReg56; __m128i resReg23_45, resReg34_56; __m128i addFilterReg32, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); // add and saturate the results together resReg23_45 = _mm_adds_epi16(resReg23, resReg45); resReg34_56 = _mm_adds_epi16(resReg34, resReg56); // shift by 6 bit each 16 bit resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); resReg23_45 = _mm_srai_epi16(resReg23_45, 6); resReg34_56 = _mm_srai_epi16(resReg34_56, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); src_ptr += src_stride; _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); output_ptr += dst_stride; // save part of the registers for next strides srcReg23 = srcReg45; srcReg34 = srcReg56; srcReg4 = srcReg6; } }
/* Function: p7_MSVFilter() * Synopsis: Calculates MSV score, vewy vewy fast, in limited precision. * Incept: SRE, Wed Dec 26 15:12:25 2007 [Janelia] * * Purpose: Calculates an approximation of the MSV score for sequence * <dsq> of length <L> residues, using optimized profile <om>, * and a preallocated one-row DP matrix <ox>. Return the * estimated MSV score (in nats) in <ret_sc>. * * Score may overflow (and will, on high-scoring * sequences), but will not underflow. * * The model may be in any mode, because only its match * emission scores will be used. The MSV filter inherently * assumes a multihit local mode, and uses its own special * state transition scores, not the scores in the profile. * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * ret_sc - RETURN: MSV score (in nats) * * Note: We misuse the matrix <ox> here, using only a third of the * first dp row, accessing it as <dp[0..Q-1]> rather than * in triplets via <{MDI}MX(q)> macros, since we only need * to store M state values. We know that if <ox> was big * enough for normal DP calculations, it must be big enough * to hold the MSVFilter calculation. * * Returns: <eslOK> on success. * <eslERANGE> if the score overflows the limited range; in * this case, this is a high-scoring hit. * * Throws: <eslEINVAL> if <ox> allocation is too small. */ int p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc) { register __m128i mpv; /* previous row values */ register __m128i xEv; /* E state: keeps max for Mk->E as we go */ register __m128i xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ register __m128i sv; /* temp storage of 1 curr row value in progress */ register __m128i biasv; /* emission bias in a vector */ uint8_t xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q = p7O_NQB(om->M); /* segment length: # of vectors */ __m128i *dp = ox->dpb[0]; /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/ __m128i *rsc; /* will point at om->rbv[x] for residue x[i] */ __m128i xJv; /* vector for states score */ __m128i tjbmv; /* vector for cost of moving from either J or N through B to an M state */ __m128i tecv; /* vector for E->C cost */ __m128i basev; /* offset for scores */ __m128i ceilingv; /* saturateed simd value used to test for overflow */ __m128i tempv; /* work vector */ int cmp; int status = eslOK; /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); ox->M = om->M; /* Try highly optimized ssv filter first */ status = p7_SSVFilter(dsq, L, om, ret_sc); if (status != eslENORESULT) return status; /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. 
*/ biasv = _mm_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */ for (q = 0; q < Q; q++) dp[q] = _mm_setzero_si128(); xJ = 0; /* saturate simd register for overflow test */ ceilingv = _mm_cmpeq_epi8(biasv, biasv); basev = _mm_set1_epi8((int8_t) om->base_b); tjbmv = _mm_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b); tecv = _mm_set1_epi8((int8_t) om->tec_b); xJv = _mm_subs_epu8(biasv, biasv); xBv = _mm_subs_epu8(basev, tjbmv); #if p7_DEBUGGING if (ox->debugging) { uint8_t xB; xB = _mm_extract_epi16(xBv, 0); xJ = _mm_extract_epi16(xJv, 0); p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ); } #endif for (i = 1; i <= L; i++) { rsc = om->rbv[dsq[i]]; xEv = _mm_setzero_si128(); /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically, which is our -infinity. */ mpv = _mm_slli_si128(dp[Q-1], 1); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */ sv = _mm_max_epu8(mpv, xBv); sv = _mm_adds_epu8(sv, biasv); sv = _mm_subs_epu8(sv, *rsc); rsc++; xEv = _mm_max_epu8(xEv, sv); mpv = dp[q]; /* Load {MDI}(i-1,q) into mpv */ dp[q] = sv; /* Do delayed store of M(i,q) now that memory is usable */ } /* test for the overflow condition */ tempv = _mm_adds_epu8(xEv, biasv); tempv = _mm_cmpeq_epi8(tempv, ceilingv); cmp = _mm_movemask_epi8(tempv); /* Now the "special" states, which start from Mk->E (->C, ->J->B) * Use shuffles instead of shifts so when the last max has completed, * the last four elements of the simd register will contain the * max value. Then the last shuffle will broadcast the max value * to all simd elements. */ tempv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(2, 3, 0, 1)); xEv = _mm_max_epu8(xEv, tempv); tempv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(0, 1, 2, 3)); xEv = _mm_max_epu8(xEv, tempv); tempv = _mm_shufflelo_epi16(xEv, _MM_SHUFFLE(2, 3, 0, 1)); xEv = _mm_max_epu8(xEv, tempv); tempv = _mm_srli_si128(xEv, 1); xEv = _mm_max_epu8(xEv, tempv); xEv = _mm_shuffle_epi32(xEv, _MM_SHUFFLE(0, 0, 0, 0)); /* immediately detect overflow */ if (cmp != 0x0000) { *ret_sc = eslINFINITY; return eslERANGE; } xEv = _mm_subs_epu8(xEv, tecv); xJv = _mm_max_epu8(xJv,xEv); xBv = _mm_max_epu8(basev, xJv); xBv = _mm_subs_epu8(xBv, tjbmv); #if p7_DEBUGGING if (ox->debugging) { uint8_t xB, xE; xB = _mm_extract_epi16(xBv, 0); xE = _mm_extract_epi16(xEv, 0); xJ = _mm_extract_epi16(xJv, 0); p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ); } #endif } /* end loop over sequence residues 1..L */ xJ = (uint8_t) _mm_extract_epi16(xJv, 0); /* finally C->T, and add our missing precision on the NN,CC,JJ back */ *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b); *ret_sc /= om->scale_b; *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */ return eslOK; }
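/* Sketch (not from the original source) of the "offset unsigned arithmetic"
 * described in the initialization comment above: scores live in uint8_t with
 * 0 playing the role of -infinity and om->base_b playing the role of 0, so
 * the saturating byte ops (_mm_adds_epu8 / _mm_subs_epu8) clip at that floor
 * instead of wrapping around. Scalar equivalents: */
#include <stdint.h>

static inline uint8_t offset_sub_sat_u8(uint8_t score, uint8_t cost)
{
  return (uint8_t)(score > cost ? score - cost : 0);   /* like _mm_subs_epu8 */
}

static inline uint8_t offset_add_sat_u8(uint8_t score, uint8_t gain)
{
  unsigned s = (unsigned)score + gain;
  return (uint8_t)(s > 255 ? 255 : s);                 /* like _mm_adds_epu8 */
}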
int global_sse2_byte(int queryLength, unsigned char *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, unsigned short bias, struct f_struct *f_str) { int i, j; int score; int scale; int distance; int offset; int position; int dup; int cmp; int iter; __m128i *pvH; __m128i *pvE; __m128i vE, vF, vH; __m128i vHInit; __m128i vHNext; __m128i vFPrev; __m128i vBias; __m128i vGapOpen; __m128i vGapExtend; __m128i vCeiling; __m128i vScale; __m128i vScaleAmt; __m128i vScaleTmp; __m128i vTemp; __m128i vNull; __m128i *pvScore; scale = 0; iter = (queryLength + 15) / 16; offset = (queryLength - 1) % iter; position = 15 - (queryLength - 1) / iter; pvH = (__m128i *)f_str->workspace; pvE = pvH + iter; /* Load the bias to all elements of a constant */ dup = (bias << 8) | (bias & 0x00ff); vBias = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */ vBias = _mm_insert_epi16 (vBias, dup, 0); vBias = _mm_shufflelo_epi16 (vBias, 0); vBias = _mm_shuffle_epi32 (vBias, 0); /* Load gap opening penalty to all elements of a constant */ dup = (gapOpen << 8) | (gapOpen & 0x00ff); vGapOpen = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */ vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0); vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0); vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0); /* Load gap extension penalty to all elements of a constant */ dup = (gapExtend << 8) | (gapExtend & 0x00ff); vGapExtend = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */ vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0); vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0); vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0); /* Generate the ceiling before scaling */ dup = (ceiling << 8) | (ceiling & 0x00ff); vTemp = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */ vTemp = _mm_insert_epi16 (vTemp, dup, 0); vTemp = _mm_shufflelo_epi16 (vTemp, 0); vTemp = _mm_shuffle_epi32 (vTemp, 0); vCeiling = _mm_cmpeq_epi8 (vTemp, vTemp); vCeiling = _mm_subs_epu8 (vCeiling, vTemp); vCeiling = _mm_subs_epu8 (vCeiling, vGapOpen); /* since we want to use the full range, zero is redefined as */ /* 2 * gapOpen. the lowest scaled score will an insert followed */ /* by a delete. */ vHInit = _mm_srli_si128 (vGapOpen, 15); /* vNull = _mm_xor_si128 (vNull, vNull); */ vNull = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */ vScaleAmt = vNull; /* Zero out the storage vector */ for (i = 0; i < iter; i++) { _mm_store_si128 (pvH + i, vGapOpen); _mm_store_si128 (pvE + i, vNull); } /* initialize F */ vF = vNull; vFPrev = vNull; /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_slli_si128 (vH, 1); vH = _mm_adds_epu8 (vH, vHInit); vH = _mm_adds_epu8 (vH, vHInit); for (i = 0; i < dbLength; ++i) { /* fetch first data asap. 
*/ pvScore = (__m128i *) profile + dbSeq[i] * iter; vF = _mm_xor_si128 (vF, vF); vH = _mm_max_epu8 (vH, vFPrev); for (j = 0; j < iter; j++) { /* correct H from the previous columns F */ vHNext = _mm_load_si128 (pvH + j); vHNext = _mm_max_epu8 (vHNext, vFPrev); /* load and correct E value */ vE = _mm_load_si128 (pvE + j); vTemp = _mm_subs_epu8 (vHNext, vGapOpen); vE = _mm_max_epu8 (vE, vTemp); _mm_store_si128 (pvE + j, vE); /* add score to vH */ vH = _mm_adds_epu8 (vH, *pvScore++); vH = _mm_subs_epu8 (vH, vBias); /* get max from vH, vE and vF */ vH = _mm_max_epu8 (vH, vE); vH = _mm_max_epu8 (vH, vF); _mm_store_si128 (pvH + j, vH); /* update vF value */ vH = _mm_subs_epu8 (vH, vGapOpen); vF = _mm_max_epu8 (vF, vH); /* load the next h values */ vH = vHNext; } /* check if we need to scale before the next round */ vTemp = _mm_subs_epu8 (vCeiling, vF); vTemp = _mm_cmpeq_epi8 (vTemp, vNull); cmp = _mm_movemask_epi8 (vTemp); /* broadcast F values */ vTemp = _mm_slli_si128 (vF, 1); vTemp = _mm_subs_epu8 (vTemp, vScaleAmt); vF = _mm_max_epu8 (vF, vTemp); vScaleTmp = _mm_slli_si128 (vScaleAmt, 1); vScaleTmp = _mm_adds_epu8 (vScaleTmp, vScaleAmt); vTemp = _mm_slli_si128 (vF, 2); vTemp = _mm_subs_epu8 (vTemp, vScaleTmp); vF = _mm_max_epu8 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 2); vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 4); vTemp = _mm_subs_epu8 (vTemp, vScaleTmp); vF = _mm_max_epu8 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 4); vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 8); vTemp = _mm_subs_epu8 (vTemp, vScaleTmp); vF = _mm_max_epu8 (vF, vTemp); /* scale if necessary */ if (cmp != 0x0000) { vScale = _mm_slli_si128 (vF, 1); vScale = _mm_subs_epu8 (vScale, vGapOpen); vScale = _mm_subs_epu8 (vScale, vScaleAmt); vTemp = _mm_slli_si128 (vScale, 1); vTemp = _mm_subs_epu8 (vScale, vTemp); vScaleAmt = _mm_adds_epu8 (vScaleAmt, vTemp); vTemp = _mm_slli_si128 (vScale, 1); vTemp = _mm_subs_epu8 (vTemp, vScale); vScaleAmt = _mm_subs_epu8 (vScaleAmt, vTemp); /* rescale the previous F */ vF = _mm_subs_epu8 (vF, vScale); /* check if we can continue in 8-bits */ vTemp = _mm_subs_epu8 (vCeiling, vF); vTemp = _mm_cmpeq_epi8 (vTemp, vNull); cmp = _mm_movemask_epi8 (vTemp); if (cmp != 0x0000) { return OVERFLOW_SCORE; } /* scale all the vectors */ for (j = 0; j < iter; j++) { /* load H and E */ vH = _mm_load_si128 (pvH + j); vE = _mm_load_si128 (pvE + j); /* get max from vH, vE and vF */ vH = _mm_subs_epu8 (vH, vScale); vE = _mm_subs_epu8 (vE, vScale); /* save the H and E */ _mm_store_si128 (pvH + j, vH); _mm_store_si128 (pvE + j, vE); } /* calculate the final scaling amount */ vScale = vScaleAmt; for (j = 0; j < position; ++j) { vScale = _mm_slli_si128 (vScale, 1); } vTemp = _mm_unpacklo_epi8 (vScale, vNull); vScale = _mm_unpackhi_epi8 (vScale, vNull); vScale = _mm_adds_epi16 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 8); vScale = _mm_adds_epi16 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 4); vScale = _mm_adds_epi16 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 2); vScale = _mm_adds_epi16 (vScale, vTemp); scale = (int) _mm_extract_epi16 (vScale, 0); } /* scale the F value for the next round */ vFPrev = _mm_slli_si128 (vF, 1); vFPrev = _mm_subs_epu8 (vFPrev, vScaleAmt); /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_slli_si128 (vH, 1); vH = _mm_subs_epu8 (vH, vScaleAmt); vH = _mm_or_si128 (vH, vHInit); } /* calculate the max global score */ vH = _mm_load_si128 (pvH + offset); vH = 
  _mm_max_epu8 (vH, vF);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 1);
  }
  score = (int) (unsigned short) _mm_extract_epi16 (vH, 7);
  score >>= 8;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
/* *********************************************************** */

mlib_status FUN_NAME(1ch) (
    mlib_affine_param *param)
{
    DECLAREVAR_BL();
    DTYPE *dstLineEnd;
    DTYPE *srcPixelPtr2;

#if MLIB_SHIFT == 15
    dX = (dX + 1) >> 1;
    dY = (dY + 1) >> 1;
#endif /* MLIB_SHIFT == 15 */

    __m128i zeros = _mm_setzero_si128();
    __m128i const_ffff = _mm_set1_epi16(0xffff);
    __m128i rounds = _mm_set1_epi16(MLIB_ROUND_SSE2);

    for (j = yStart; j <= yFinish; j++) {
        __m128i fdxs, fdys, deltax, deltay;
        __m128i fdx2s, fdy2s;
        __m128i a00_0s, a01_0s, a10_0s, a11_0s;
        __m128i pix0_0s, pix0_1s, pix1_0s, pix1_1s, res0s;
        mlib_s32 fdx, fdy;
        mlib_s32 a00_0, a01_0, a10_0, a11_0;
        mlib_s32 pix0_0, pix1_0, res0;
        mlib_u8 *srcPtr_0, *srcPtr_1, *srcPtr_2, *srcPtr_3;
        mlib_u8 *srcPtr_4, *srcPtr_5, *srcPtr_6, *srcPtr_7;
int global_sse2_word(int queryLength, unsigned short *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, struct f_struct *f_str) { int i, j; int score; int scale; int temp; int distance; int offset; int position; int cmp; int iter; __m128i *pvH; __m128i *pvE; __m128i vE, vF, vH; __m128i vHNext; __m128i vFPrev; __m128i vGapOpen; __m128i vGapExtend; __m128i vCeiling; __m128i vScale; __m128i vScaleAmt; __m128i vScaleTmp; __m128i vTemp; __m128i vNull; __m128i *pvScore; scale = 0; iter = (queryLength + 7) / 8; offset = (queryLength - 1) % iter; position = 7 - (queryLength - 1) / iter; pvH = (__m128i *)f_str->workspace; pvE = pvH + iter; /* Load gap opening penalty to all elements of a constant */ vGapOpen = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0); vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0); vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0); /* Load gap extension penalty to all elements of a constant */ vGapExtend = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0); vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0); vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0); /* Generate the ceiling before scaling */ vTemp = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0); vTemp = _mm_shufflelo_epi16 (vTemp, 0); vTemp = _mm_shuffle_epi32 (vTemp, 0); vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp); vCeiling = _mm_srli_epi16 (vCeiling, 1); vCeiling = _mm_subs_epi16 (vCeiling, vTemp); vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); vNull = _mm_cmpeq_epi16 (vTemp, vTemp); vNull = _mm_slli_epi16 (vNull, 15); vScaleAmt = _mm_xor_si128 (vNull, vNull); /* Zero out the storage vector */ vTemp = _mm_adds_epi16 (vNull, vGapOpen); for (i = 0; i < iter; i++) { _mm_store_si128 (pvH + i, vTemp); _mm_store_si128 (pvE + i, vNull); } /* initialize F */ vF = vNull; vFPrev = vNull; /* load and scale H for the next round */ vTemp = _mm_srli_si128 (vGapOpen, 14); vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_adds_epi16 (vH, vTemp); for (i = 0; i < dbLength; ++i) { /* fetch first data asap. 
*/ pvScore = (__m128i *) profile + dbSeq[i] * iter; vF = vNull; vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < iter; j++) { /* correct H from the previous columns F */ vHNext = _mm_load_si128 (pvH + j); vHNext = _mm_max_epi16 (vHNext, vFPrev); /* load and correct E value */ vE = _mm_load_si128 (pvE + j); vTemp = _mm_subs_epi16 (vHNext, vGapOpen); vE = _mm_max_epi16 (vE, vTemp); _mm_store_si128 (pvE + j, vE); /* add score to vH */ vH = _mm_adds_epi16 (vH, *pvScore++); /* get max from vH, vE and vF */ vH = _mm_max_epi16 (vH, vE); vH = _mm_max_epi16 (vH, vF); _mm_store_si128 (pvH + j, vH); /* update vF value */ vH = _mm_subs_epi16 (vH, vGapOpen); vF = _mm_max_epi16 (vF, vH); /* load the next h values */ vH = vHNext; } /* check if we need to scale before the next round */ vTemp = _mm_cmpgt_epi16 (vF, vCeiling); cmp = _mm_movemask_epi8 (vTemp); /* broadcast F values */ vF = _mm_xor_si128 (vF, vNull); vTemp = _mm_slli_si128 (vF, 2); vTemp = _mm_subs_epu16 (vTemp, vScaleAmt); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vF, 4); vScaleTmp = _mm_slli_si128 (vScaleAmt, 2); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 4); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 8); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); /* scale if necessary */ if (cmp != 0x0000) { __m128i vScale1; __m128i vScale2; vScale = _mm_slli_si128 (vF, 2); vScale = _mm_subs_epu16 (vScale, vGapOpen); vScale = _mm_subs_epu16 (vScale, vScaleAmt); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vScale, vTemp); vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vTemp, vScale); vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp); /* rescale the previous F */ vF = _mm_subs_epu16 (vF, vScale); /* check if we can continue in signed 16-bits */ vTemp = _mm_xor_si128 (vF, vNull); vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling); cmp = _mm_movemask_epi8 (vTemp); if (cmp != 0x0000) { return OVERFLOW_SCORE; } vTemp = _mm_adds_epi16 (vCeiling, vCeiling); vScale1 = _mm_subs_epu16 (vScale, vTemp); vScale2 = _mm_subs_epu16 (vScale, vScale1); /* scale all the vectors */ for (j = 0; j < iter; j++) { /* load H and E */ vH = _mm_load_si128 (pvH + j); vE = _mm_load_si128 (pvE + j); /* get max from vH, vE and vF */ vH = _mm_subs_epi16 (vH, vScale1); vH = _mm_subs_epi16 (vH, vScale2); vE = _mm_subs_epi16 (vE, vScale1); vE = _mm_subs_epi16 (vE, vScale2); /* save the H and E */ _mm_store_si128 (pvH + j, vH); _mm_store_si128 (pvE + j, vE); } vScale = vScaleAmt; for (j = 0; j < position; ++j) { vScale = _mm_slli_si128 (vScale, 2); } /* calculate the final scaling amount */ vTemp = _mm_xor_si128 (vTemp, vTemp); vScale1 = _mm_unpacklo_epi16 (vScale, vTemp); vScale2 = _mm_unpackhi_epi16 (vScale, vTemp); vScale = _mm_add_epi32 (vScale1, vScale2); vTemp = _mm_srli_si128 (vScale, 8); vScale = _mm_add_epi32 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 4); vScale = _mm_add_epi32 (vScale, vTemp); scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0); temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1); scale = scale + (temp << 16); } /* scale the F value for the next round */ vFPrev = _mm_slli_si128 (vF, 2); vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt); vFPrev = _mm_xor_si128 (vFPrev, vNull); /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_xor_si128 (vH, vNull); vH = _mm_slli_si128 
        (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t* zbin_ptr, const int16_t* round_ptr, const int16_t* quant_ptr, const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) { __m128i zero; (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); if (!skip_block) { __m128i eob; __m128i round, quant, dequant; { __m128i coeff0, coeff1; // Setup global values { round = _mm_load_si128((const __m128i*)round_ptr); quant = _mm_load_si128((const __m128i*)quant_ptr); dequant = _mm_load_si128((const __m128i*)dequant_ptr); } { __m128i coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; // Do DC and first 15 AC coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); qcoeff0 = _mm_adds_epi16(qcoeff0, round); round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { // Scan for eob __m128i zero_coeff0, zero_coeff1; __m128i nzero_coeff0, nzero_coeff1; __m128i iscan0, iscan1; __m128i eob1; zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob = _mm_max_epi16(eob, eob1); } n_coeffs += 8 * 2; } // AC only loop while (n_coeffs < 0) { __m128i coeff0, coeff1; { __m128i coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); qcoeff0 = _mm_adds_epi16(qcoeff0, 
round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); } { // Scan for eob __m128i zero_coeff0, zero_coeff1; __m128i nzero_coeff0, nzero_coeff1; __m128i iscan0, iscan1; __m128i eob0, eob1; zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob0 = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob0 = _mm_max_epi16(eob0, eob1); eob = _mm_max_epi16(eob, eob0); } n_coeffs += 8 * 2; } // Accumulate EOB { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); *eob_ptr = _mm_extract_epi16(eob, 1); } } else { do { _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; } }
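/* Sketch of the "poor man's sign extract" idiom used above (illustration
 * only): arithmetic-shifting a 16-bit lane right by 15 yields 0 or -1; XORing
 * with that mask and subtracting it gives |x|, and applying the same two ops
 * again after quantization restores the original sign. */
#include <stdint.h>

static inline int16_t abs16_via_sign_mask(int16_t x)
{
  int16_t sign = (int16_t)(x >> 15);       /* 0 for x >= 0, -1 for x < 0 */
  return (int16_t)((x ^ sign) - sign);     /* |x|; INT16_MIN wraps, as in the SIMD code */
}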
mlib_status
__mlib_VectorConvert_S32_S16_Mod(
    mlib_s32 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    if (n < 1)
        return (MLIB_FAILURE);

    mlib_s32 i, ax, az, nstep, n1, n2, n3;
    mlib_s16 *px = (mlib_s16 *)x;
    mlib_s32 *pz = (mlib_s32 *)z;
    __m128i xbuf, zlo, zhi, zero;

    zero = _mm_setzero_si128();
    ax = (mlib_addr)x & 15;
    az = (mlib_addr)z & 15;
    nstep = 16 / sizeof (mlib_s16);
    n1 = ((16 - ax) & 15) / sizeof (mlib_s16);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            *pz++ = *px++;
        }
    } else {
        for (i = 0; i < n1; i++) {
            *pz++ = *px++;
        }

        if ((ax * 2 & 15) == az) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                zlo = _mm_unpacklo_epi16(zero, xbuf);
                zhi = _mm_unpackhi_epi16(zero, xbuf);
                zlo = _mm_srai_epi32(zlo, 16);
                zhi = _mm_srai_epi32(zhi, 16);
                _mm_store_si128((__m128i *)pz, zlo);
                _mm_store_si128((__m128i *)pz + 1, zhi);
                px += nstep;
                pz += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                zlo = _mm_unpacklo_epi16(zero, xbuf);
                zhi = _mm_unpackhi_epi16(zero, xbuf);
                zlo = _mm_srai_epi32(zlo, 16);
                zhi = _mm_srai_epi32(zhi, 16);
                _mm_storeu_si128((__m128i *)pz, zlo);
                _mm_storeu_si128((__m128i *)pz + 1, zhi);
                px += nstep;
                pz += nstep;
            }
        }

        for (i = 0; i < n3; i++) {
            *pz++ = *px++;
        }
    }
    return (MLIB_SUCCESS);
}
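/* The widening trick above, pulled out as a helper (illustration only):
 * interleaving the 16-bit values into the high half of each 32-bit lane and
 * then doing an arithmetic right shift by 16 sign-extends s16 to s32 using
 * only SSE2, i.e. without SSE4.1's _mm_cvtepi16_epi32. */
#include <emmintrin.h>

static inline void widen_s16_to_s32(__m128i v, __m128i *lo, __m128i *hi)
{
    const __m128i zero = _mm_setzero_si128();
    *lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
    *hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);
}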
void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_,
                uint32_t* idx_max_, float* min_, float* max_)
{
  // We suppose that pointers are aligned on a 16-byte boundary
  // Initialise SSE registers
  __m128i sse_idx_min = _mm_setzero_si128();
  __m128i sse_idx_max = _mm_setzero_si128();
  __m128 sse_min = _mm_set1_ps(FLT_MAX);
  __m128 sse_max = _mm_set1_ps(FLT_MIN);

  // We will unroll the for-loop by four, thus doing
  // (n/4) iterations.
  const uint32_t n_sse = n & ~3ULL;

  __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
  const __m128i sse_4 = _mm_set1_epi32(4);
  for (uint32_t i = 0; i < n_sse; i += 4) {
    const __m128 sse_v = _mm_load_ps(&buf[i]);
    const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
    const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);
    sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
    sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);
    sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx,
                                          (__m128) sse_cmp_min);
    sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx,
                                          (__m128) sse_cmp_max);
    sse_idx = _mm_add_epi32(sse_idx, sse_4);
  }

  // SSE reduction
  float __attribute__((aligned(16))) mins[4];
  float __attribute__((aligned(16))) maxs[4];
  _mm_store_ps(mins, sse_min);
  _mm_store_ps(maxs, sse_max);

  float min = mins[0];
  float max = maxs[0];
  uint32_t idx_min = _mm_extract_epi32(sse_idx_min, 0);
  uint32_t idx_max = _mm_extract_epi32(sse_idx_max, 0);

  // Unrolled by GCC
  for (int i = 1; i < 4; i++) {
    float v = mins[i];
    if (v < min) {
      min = v;
      idx_min = _mm_extract_epi32(sse_idx_min, i);
    }
    v = maxs[i];
    if (v > max) {
      max = v;
      idx_max = _mm_extract_epi32(sse_idx_max, i);
    }
  }

  // Epilogue
  for (uint32_t i = n_sse; i < n; i++) {
    const float v = buf[i];
    if (v < min) {
      min = v;
      idx_min = i;
    }
    if (v > max) {
      max = v;
      idx_max = i;
    }
  }

  *idx_min_ = idx_min;
  *min_ = min;
  *idx_max_ = idx_max;
  *max_ = max;
}
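/* Minimal usage sketch for minmax_vec() (values are illustrative; the buffer
 * must be 16-byte aligned, as the comment above requires). */
#include <stdint.h>
#include <stdio.h>

static void minmax_vec_demo(void)
{
  float buf[8] __attribute__((aligned(16))) =
      { 3.0f, -1.5f, 7.25f, 0.0f, 2.0f, 9.5f, -4.0f, 1.0f };
  uint32_t imin, imax;
  float vmin, vmax;

  minmax_vec(8, buf, &imin, &imax, &vmin, &vmax);
  printf("min %f at index %u, max %f at index %u\n", vmin, imin, vmax, imax);
}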
#ifndef _MAX /* avoid collision with common (nonconforming) macros */
#define _MAX (std::max)
#endif

#define MAX_DIMENSION 4000      // Maximum width or height supported
#define SUBPIXEL_MULTIPLIER 8

// Static constants for use by alpha_blend_sse2
static __m128i low_mask = _mm_set1_epi16(0xFF);
static __m128i red_mask = _mm_set1_epi32(0xFF);
static __m128i green_mask = _mm_set1_epi32(0xFF00);
static __m128i blue_mask = _mm_set1_epi32(0xFF0000);
static __m128i alpha_bit_mask = _mm_set1_epi32(0xFF000000);
static __m128i one = _mm_set1_epi16(1);
static __m128i inv_one = _mm_set1_epi16(0x100);
static __m128i zero = _mm_setzero_si128();

int Rasterizer::getOverlayWidth()
{
    return mOverlayWidth * 8;
}

Rasterizer::Rasterizer()
    : mpPathTypes(nullptr)
    , mpPathPoints(nullptr)
    , mPathPoints(0)
    , mpOverlayBuffer(nullptr)
    , mOverlayWidth(0)
    , mOverlayHeight(0)
    , mPathOffsetX(0)
    , mPathOffsetY(0)
void cv::FAST(const Mat& img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression) { const int K = 8, N = 16 + K + 1; int i, j, k, pixel[N]; makeOffsets(pixel, img.step); for(k = 16; k < N; k++) pixel[k] = pixel[k - 16]; keypoints.clear(); threshold = std::min(std::max(threshold, 0), 255); #if CV_SSE2 __m128i delta = _mm_set1_epi8(128), t = _mm_set1_epi8(threshold), K16 = _mm_set1_epi8(K); #endif uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3]; buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; int* cpbuf[3]; cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; cpbuf[1] = cpbuf[0] + img.cols + 1; cpbuf[2] = cpbuf[1] + img.cols + 1; memset(buf[0], 0, img.cols*3); for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr<uchar>(i) + 3; uchar* curr = buf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3]; memset(curr, 0, img.cols); int ncorners = 0; if( i < img.rows - 3 ) { j = 3; #if CV_SSE2 for(; j < img.cols - 16 - 3; j += 16, ptr += 16) { __m128i m0, m1; __m128i v0 = _mm_loadu_si128((const __m128i*)ptr); __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta); v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta); __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta); __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[4])), delta); __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[8])), delta); __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[12])), delta); m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0)); m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1)); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0))); m0 = _mm_or_si128(m0, m1); int mask = _mm_movemask_epi8(m0); if( mask == 0 ) continue; if( (mask & 255) == 0 ) { j -= 8; ptr -= 8; continue; } __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0; for( k = 0; k < N; k++ ) { __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta); m0 = _mm_cmpgt_epi8(x, v0); m1 = _mm_cmpgt_epi8(v1, x); c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0); c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1); max0 = _mm_max_epu8(max0, c0); max1 = _mm_max_epu8(max1, c1); } max0 = _mm_max_epu8(max0, max1); int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) if(m & 1) { cornerpos[ncorners++] = j+k; if(nonmax_suppression) curr[j+k] = cornerScore(ptr+k, pixel, threshold); } } #endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; const uchar* tab = &threshold_tab[0] - v + 255; int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; d &= tab[ptr[pixel[3]]] | 
tab[ptr[pixel[11]]]; d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; if( d & 1 ) { int vt = v - threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x < vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = cornerScore(ptr, pixel, threshold); break; } } else count = 0; } } if( d & 2 ) { int vt = v + threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = cornerScore(ptr, pixel, threshold); break; } } else count = 0; } } } } cornerpos[-1] = ncorners; if( i == 3 ) continue; const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3]; ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ ) { j = cornerpos[k]; int score = prev[j]; if( !nonmax_suppression || (score > prev[j+1] && score > prev[j-1] && score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) { keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score)); } } }
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    * p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * This works naturally with our main loop's p = a+b-c if we force a and c
    * to zero.
    * Here we zero b and d, which become c and a respectively at the start of
    * the loop.
    */
   png_size_t rb;
   const __m128i zero = _mm_setzero_si128();
   __m128i pa, pb, pc, smallest, nearest;
   __m128i c, b = zero,
           a, d = zero;

   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   rb = row_info->rowbytes+4;
   while (rb > 4)
   {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      pa = _mm_sub_epi16(b, c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      pb = _mm_sub_epi16(a, c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      pc = _mm_add_epi16(pa, pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                            c));

      /* Note `_epi8`: we need addition to wrap modulo 255. */
      d = _mm_add_epi8(d, nearest);
      store4(row, _mm_packus_epi16(d, d));

      prev += 4;
      row  += 4;
      rb   -= 4;
   }
}
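/* Scalar reference for the Paeth predictor described above (this is the
 * predictor from the PNG specification, shown only to make the vector code's
 * tie-breaking order concrete: a is preferred over b, and b over c). */
#include <stdlib.h>

static inline unsigned char paeth_predictor(unsigned char a, unsigned char b,
                                            unsigned char c)
{
   int p  = (int)a + (int)b - (int)c;   /* initial estimate */
   int pa = abs(p - (int)a);
   int pb = abs(p - (int)b);
   int pc = abs(p - (int)c);
   if (pa <= pb && pa <= pc) return a;
   if (pb <= pc) return b;
   return c;
}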
mlib_status __mlib_ImageBlend_OMSC_SAS( mlib_image *dst, const mlib_image *src1, const mlib_image *src2, mlib_s32 cmask) { mlib_s32 src_alpha, dst_alpha; mlib_s32 min; BLEND_VALIDATE; if (channels == 3) return (__mlib_ImageBlend_OMSC_ZERO(dst, src1, src2, cmask)); mlib_s32 d_s0, d_s1, d_s2, d_s3; int k; __m128i *px, *py, *pz; __m128i dx, dy; /* upper - 1 lower - 0 */ __m128i dx_1, dx_0, dy_1, dy_0, dz_1, dz_0; __m128i dall_zero; __m128i df_f = _mm_set1_epi32(0x00ff00ff); __m128i done_one = _mm_set1_epi32(0x00010001); dall_zero = _mm_setzero_si128(); if (cmask == 8) { if (0 == (((((mlib_addr) psrc1 | (mlib_addr)psrc2 | (mlib_addr)pdst)) & 0xf)) && (0 == (((src1_stride | src2_stride | dst_stride) & 0xf) || (1 == dst_height)))) { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_load_si128(px); dy = _mm_load_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_8(dx_1, dy_1, dz_1); PROCESS_DATA_8(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_store_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_8; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } else { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_loadu_si128(px); dy = _mm_loadu_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_8(dx_1, dy_1, dz_1); PROCESS_DATA_8(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_storeu_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_8; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } } else { if (0 == (((((mlib_addr) psrc1 | (mlib_addr)psrc2 | (mlib_addr)pdst)) & 0xf)) && (0 == (((src1_stride | src2_stride | dst_stride) & 0xf) || (1 == dst_height)))) { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_load_si128(px); dy = _mm_load_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_1(dx_1, dy_1, dz_1); PROCESS_DATA_1(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_store_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_1; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } else { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_loadu_si128(px); dy = _mm_loadu_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_1(dx_1, dy_1, dz_1); PROCESS_DATA_1(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_storeu_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_1; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } } return (MLIB_SUCCESS); }
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);

      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task, &tri->inputs,
                                  x, y, 0xffff & ~mask);
   }
}
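/* Sketch of the sign-bit adjustment used above (illustration only): after
 * c = _mm_sub_epi32(c, _mm_set1_epi32(1)), a sample fails a plane test exactly
 * when the adjusted value is negative, so OR-ing the three planes and reading
 * the sign bits through packs + _mm_movemask_epi8 replaces explicit <= 0
 * comparisons; the complement of the resulting mask is the coverage passed to
 * lp_rast_shade_quads_mask(). */
static inline int plane_outside(int c)
{
   /* equivalent to (c <= 0) for values safely above INT_MIN */
   return (c - 1) < 0;
}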