static void TEST (void) { union { __m128i x[NUM / 2]; long long ll[NUM]; } dst; union { __m128i x[NUM / 2]; int i[NUM * 2]; } src1, src2; int i, sign = 1; long long value; for (i = 0; i < NUM * 2; i += 2) { src1.i[i] = i * i * sign; src2.i[i] = (i + 20) * sign; sign = -sign; } for (i = 0; i < NUM; i += 2) dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]); for (i = 0; i < NUM; i++) { value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2]; if (value != dst.ll[i]) abort (); } }
int oneThread(int threadId) { int *aa; int *bb; int k; int itr; aa = (int *)_mm_malloc(sizeof(int)*ARRAY_SIZE, 16); bb = (int *)_mm_malloc(sizeof(int)*ARRAY_SIZE, 16); memset(&aa[0], 1, ARRAY_SIZE*4); memset(&bb[0], 2, ARRAY_SIZE*4); __m128i a0,a1,a2,a3,b0,b1,b2,b3; __m128i a4,a5,a6,a7,b4,b5,b6,b7; __m128i c0,c1,c2,c3; __m128i c4,c5,c6,c7; __m128i cc; cc = _mm_set_epi32 (0, 0, 0, 0); for (k = 0; k < REPS; k++) { for (itr = 0; itr<ARRAY_SIZE; itr+=32) { a0 = _mm_load_si128((__m128i*)&aa[itr]); a1 = _mm_load_si128((__m128i*)&aa[itr+4]); a2 = _mm_load_si128((__m128i*)&aa[itr+8]); a3 = _mm_load_si128((__m128i*)&aa[itr+12]); a4 = _mm_load_si128((__m128i*)&aa[itr+16]); a5 = _mm_load_si128((__m128i*)&aa[itr+20]); a6 = _mm_load_si128((__m128i*)&aa[itr+24]); a7 = _mm_load_si128((__m128i*)&aa[itr+28]); b0 = _mm_load_si128((__m128i*)&bb[itr]); b1 = _mm_load_si128((__m128i*)&bb[itr+4]); b2 = _mm_load_si128((__m128i*)&bb[itr+8]); b3 = _mm_load_si128((__m128i*)&bb[itr+12]); b4 = _mm_load_si128((__m128i*)&bb[itr+16]); b5 = _mm_load_si128((__m128i*)&bb[itr+20]); b6 = _mm_load_si128((__m128i*)&bb[itr+24]); b7 = _mm_load_si128((__m128i*)&bb[itr+28]); c0 = _mm_mul_epi32(a0, b0); c1 = _mm_mul_epi32(a1, b1); c2 = _mm_mul_epi32(a2, b2); c3 = _mm_mul_epi32(a3, b3); c4 = _mm_mul_epi32(a4, b4); c5 = _mm_mul_epi32(a5, b5); c6 = _mm_mul_epi32(a6, b6); c7 = _mm_mul_epi32(a7, b7); c0 = _mm_add_epi32(c0,c1); c1 = _mm_add_epi32(c2,c3); c2 = _mm_add_epi32(c4,c5); c3 = _mm_add_epi32(c6,c7); c0 = _mm_add_epi32(c0,c1); c1 = _mm_add_epi32(c2,c3); c0 = _mm_add_epi32(c0,c1); cc = _mm_add_epi32(cc,c0); } } cc = _mm_hadd_epi32(cc,cc); cc = _mm_hadd_epi32(cc,cc); int count =0; count = _mm_cvtsi128_si32(cc) ; free(aa); free(bb); return count; }
/* Compute reflection coefficients from input signal */ void silk_burg_modified_sse4_1( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ const opus_int16 x[], /* I Input signal, length: nb_subfr * (D + subfr_length) */ const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I Number of subframes stacked in x */ const opus_int D, /* I Order */ int arch /* I Run-time architecture */ ) { opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; const opus_int16 *x_ptr; opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; __m128i CONST1 = _mm_set1_epi32(1); silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE); /* Compute autocorrelations, added over subframes */ silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length); if(rshifts > MAX_RSHIFTS) { C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS); silk_assert(C0 > 0); rshifts = MAX_RSHIFTS; } else { lz = silk_CLZ32(C0) - 1; rshifts_extra = N_BITS_HEAD_ROOM - lz; if(rshifts_extra > 0) { rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts); C0 = silk_RSHIFT32(C0, rshifts_extra); } else { rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts); C0 = silk_LSHIFT32(C0, -rshifts_extra); } rshifts += rshifts_extra; } CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts); } } } else { for(s = 0; s < nb_subfr; s++) { int i; opus_int32 d; x_ptr = x + s * subfr_length; celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch); for(n = 1; n < D + 1; n++) { for (i = n + subfr_length - D, d = 0; i < subfr_length; i++) d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]); xcorr[ n - 1 ] += d; } for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts); } } } silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); /* Initialize */ CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ invGain_Q30 = (opus_int32)1 << 30; reached_max_gain = 0; for(n = 0; n < D; n++) { /* Update first row of correlation matrix (without first element) */ /* Update last row of correlation matrix (without last element, stored in reversed order) */ /* Update C * Af */ /* Update C * flipud(Af) (stored in reversed order) */ if(rshifts > -2) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], 16 - rshifts); /* Q(16-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts); /* Q(16-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], QA - 16); /* Q(QA-16) */ tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16); /* Q(QA-16) */ for(k = 0; k < n; k++) { C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_SMLAWB(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp_QA = Af_QA[ k ]; tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]); /* Q(QA-16) */ } tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts); /* Q(16-rshifts) */ tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts); /* Q(16-rshifts) */ for(k = 0; k <= n; k++) { CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]); /* Q(-rshift) */ } } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], -rshifts); /* Q(-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts); /* Q(-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], 17); /* Q17 */ tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17); /* Q17 */ X1_3210 = _mm_set1_epi32(x1); X2_3210 = _mm_set1_epi32(x2); TMP1_3210 = _mm_setzero_si128(); TMP2_3210 = _mm_setzero_si128(); for(k = 0; k < n - 3; k += 4) { PTR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]); SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]); FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]); PTR_3210 = _mm_shuffle_epi32(PTR_3210, _MM_SHUFFLE(0, 1, 2, 3)); LAST_3210 = _mm_loadu_si128((__m128i *)&C_last_row[ k ]); ATMP_3210 = _mm_loadu_si128((__m128i *)&Af_QA[ k ]); T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210); T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7); ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1); FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210); LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210); PTR_3210 = _mm_mullo_epi32(ATMP_3210, PTR_3210); SUBFR_3210 = _mm_mullo_epi32(ATMP_3210, SUBFR_3210); _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210); _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210); TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210); TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210); } TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210)); TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E)); tmp1 += _mm_cvtsi128_si32(TMP1_3210); tmp2 += _mm_cvtsi128_si32(TMP2_3210); for(; k < n; k++) { C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_MLA(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17); /* Q17 */ tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ], Atmp1); /* Q17 */ tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1); /* Q17 */ } tmp1 = -tmp1; /* Q17 */ tmp2 = -tmp2; /* Q17 */ { __m128i xmm_tmp1, xmm_tmp2; __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; xmm_tmp1 = _mm_set1_epi32(tmp1); xmm_tmp2 = _mm_set1_epi32(tmp2); for(k = 0; k <= n - 3; k += 4) { xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]); xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]); xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3)); xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1); xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1); /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1); xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1); xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2); xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2); xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16); xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16); xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16); xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16); xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC); xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC); X1_3210 = _mm_loadu_si128((__m128i *)&CAf[ k ]); PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]); X1_3210 = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0); PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0); _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210); _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210); } for(; k <= n; k++) { CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1, silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1)); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2, silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */ } } } } /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ tmp1 = C_first_row[ n ]; /* Q(-rshifts) */ tmp2 = C_last_row[ n ]; /* Q(-rshifts) */ num = 0; /* Q(-rshifts) */ nrg = silk_ADD32(CAb[ 0 ], CAf[ 0 ]); /* Q(1-rshifts) */ for(k = 0; k < n; k++) { Atmp_QA = Af_QA[ k ]; lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1; lz = silk_min(32 - QA, lz); Atmp1 = silk_LSHIFT32(Atmp_QA, lz); /* Q(QA + lz) */ tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ num = silk_ADD_LSHIFT32(num, silk_SMMUL(CAb[ n - k ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ nrg = silk_ADD_LSHIFT32(nrg, silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]), Atmp1), 32 - QA - lz); /* Q(1-rshifts) */ } CAf[ n + 1 ] = tmp1; /* Q(-rshifts) */ CAb[ n + 1 ] = tmp2; /* Q(-rshifts) */ num = silk_ADD32(num, tmp2); /* Q(-rshifts) */ num = silk_LSHIFT32(-num, 1); /* Q(1-rshifts) */ /* Calculate the next order reflection (parcor) coefficient */ if(silk_abs(num) < nrg) { rc_Q31 = silk_DIV32_varQ(num, nrg, 31); } else { rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN; } /* Update inverse prediction gain */ tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31); tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2); if(tmp1 <= minInvGain_Q30) { /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */ tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30); /* Q30 */ rc_Q31 = silk_SQRT_APPROX(tmp2); /* Q15 */ /* Newton-Raphson iteration */ rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1); /* Q15 */ rc_Q31 = silk_LSHIFT32(rc_Q31, 16); /* Q31 */ if(num < 0) { /* Ensure adjusted reflection coefficients has the original sign */ rc_Q31 = -rc_Q31; } invGain_Q30 = minInvGain_Q30; reached_max_gain = 1; } else { invGain_Q30 = tmp1; } /* Update the AR coefficients */ for(k = 0; k < (n + 1) >> 1; k++) { tmp1 = Af_QA[ k ]; /* QA */ tmp2 = Af_QA[ n - k - 1 ]; /* QA */ Af_QA[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* QA */ Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* QA */ } Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA); /* QA */ if(reached_max_gain) { /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ for(k = n + 1; k < D; k++) { Af_QA[ k ] = 0; } break; } /* Update C * Af and C * Ab */ for(k = 0; k <= n + 1; k++) { tmp1 = CAf[ k ]; /* Q(-rshifts) */ tmp2 = CAb[ n - k + 1 ]; /* Q(-rshifts) */ CAf[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* Q(-rshifts) */ CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* Q(-rshifts) */ } } if(reached_max_gain) { for(k = 0; k < D; k++) { /* Scale coefficients */ A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); } /* Subtract energy of preceding samples from C0 */ if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts); } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts); } } /* Approximate residual energy */ *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2); *res_nrg_Q = -rshifts; } else { /* Return residual energy */ nrg = CAf[ 0 ]; /* Q(-rshifts) */ tmp1 = (opus_int32)1 << 16; /* Q16 */ for(k = 0; k < D; k++) { Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); /* Q16 */ nrg = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1); /* Q(-rshifts) */ tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1); /* Q16 */ A_Q16[ k ] = -Atmp1; } *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */ *res_nrg_Q = -rshifts; } }
__m128i test_mm_mul_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mul_epi32 // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq // CHECK-ASM: pmuldq %xmm{{.*}}, %xmm{{.*}} return _mm_mul_epi32(x, y); }
BOOST_FORCEINLINE __m128i __vectorcall operator * ( __m128i const left, __m128i const right ) { return _mm_mul_epi32( left, right ); }
__m128i test_mm_mul_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mul_epi32 // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_mul_epi32(x, y); }
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */ void silk_VQ_WMat_EC_sse4_1( opus_int8 *ind, /* O index of best codebook vector */ opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ const opus_int16 *in_Q14, /* I input vector to be quantized */ const opus_int32 *W_Q18, /* I weighting matrix */ const opus_int8 *cb_Q7, /* I codebook */ const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ opus_int L /* I number of vectors in codebook */ ) { opus_int k, gain_tmp_Q7; const opus_int8 *cb_row_Q7; opus_int16 diff_Q14[ 5 ]; opus_int32 sum1_Q14, sum2_Q16; __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5; /* Loop over codebook */ *rate_dist_Q14 = silk_int32_MAX; cb_row_Q7 = cb_Q7; for( k = 0; k < L; k++ ) { gain_tmp_Q7 = cb_gain_Q7[k]; diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 ); C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] ); C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] ); C_tmp2 = _mm_slli_epi32( C_tmp2, 7 ); C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 ); diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 ); diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 ); diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 ); diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 ); /* Weighted rate */ sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] ); /* Penalty for too large gain */ sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 ); silk_assert( sum1_Q14 >= 0 ); /* first row of W_Q18 */ C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) ); C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 ); C_tmp4 = _mm_srli_si128( C_tmp4, 2 ); C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */ C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 ); C_tmp5 = _mm_srli_si128( C_tmp5, 2 ); C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 ); C_tmp5 = _mm_slli_epi32( C_tmp5, 1 ); C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] ); sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] ); /* second row of W_Q18 */ sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] ); sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] ); sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] ); /* third row of W_Q18 */ sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] ); sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] ); sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] ); /* fourth row of W_Q18 */ sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] ); sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] ); sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] ); /* last row of W_Q18 */ sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] ); sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] ); silk_assert( sum1_Q14 >= 0 ); /* find best */ if( sum1_Q14 < *rate_dist_Q14 ) { *rate_dist_Q14 = sum1_Q14; *ind = (opus_int8)k; *gain_Q7 = gain_tmp_Q7; } /* Go to next cbk vector */ cb_row_Q7 += LTP_ORDER; } }