int16_t WebRtcSpl_GetScalingSquare(int16_t* in_vector, int in_vector_length, int times) { int16_t nbits = WebRtcSpl_GetSizeInBits(times); int i; int16_t smax = -1; int16_t sabs; int16_t *sptr = in_vector; int16_t t; int looptimes = in_vector_length; for (i = looptimes; i > 0; i--) { sabs = (*sptr > 0 ? *sptr++ : -*sptr++); smax = (sabs > smax ? sabs : smax); } t = WebRtcSpl_NormW32(WEBRTC_SPL_MUL(smax, smax)); if (smax == 0) { return 0; // Since norm(0) returns 0 } else { return (t > nbits) ? 0 : nbits - t; } }
int WebRtcSpl_AutoCorrelation(const int16_t* in_vector, int in_vector_length, int order, int32_t* result, int* scale) { int32_t sum = 0; int i = 0, j = 0; int16_t smax = 0; int scaling = 0; if (order > in_vector_length) { /* Undefined */ return -1; } else if (order < 0) { order = in_vector_length; } // Find the maximum absolute value of the samples. smax = WebRtcSpl_MaxAbsValueW16(in_vector, in_vector_length); // In order to avoid overflow when computing the sum we should scale the // samples so that (in_vector_length * smax * smax) will not overflow. if (smax == 0) { scaling = 0; } else { // Number of bits in the sum loop. int nbits = WebRtcSpl_GetSizeInBits((uint32_t)in_vector_length); // Number of bits to normalize smax. int t = WebRtcSpl_NormW32(WEBRTC_SPL_MUL(smax, smax)); if (t > nbits) { scaling = 0; } else { scaling = nbits - t; } } // Perform the actual correlation calculation. for (i = 0; i < order + 1; i++) { sum = 0; /* Unroll the loop to improve performance. */ for (j = 0; i + j + 3 < in_vector_length; j += 4) { sum += (in_vector[j + 0] * in_vector[i + j + 0]) >> scaling; sum += (in_vector[j + 1] * in_vector[i + j + 1]) >> scaling; sum += (in_vector[j + 2] * in_vector[i + j + 2]) >> scaling; sum += (in_vector[j + 3] * in_vector[i + j + 3]) >> scaling; } for (; j < in_vector_length - i; j++) { sum += (in_vector[j] * in_vector[i + j]) >> scaling; } *result++ = sum; } *scale = scaling; return order + 1; }
// Compute speech/noise probability // speech/noise probability is returned in: probSpeechFinal //snrLocPrior is the prior SNR for each frequency (in Q11) //snrLocPost is the post SNR for each frequency (in Q11) void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst, uint16_t* nonSpeechProbFinal, uint32_t* priorLocSnr, uint32_t* postLocSnr) { uint32_t zeros, num, den, tmpU32no1, tmpU32no2, tmpU32no3; int32_t invLrtFX, indPriorFX, tmp32, tmp32no1, tmp32no2, besselTmpFX32; int32_t frac32, logTmp; int32_t logLrtTimeAvgKsumFX; int16_t indPriorFX16; int16_t tmp16, tmp16no1, tmp16no2, tmpIndFX, tableIndex, frac, intPart; int i, normTmp, normTmp2, nShifts; // compute feature based on average LR factor // this is the average over all frequencies of the smooth log LRT logLrtTimeAvgKsumFX = 0; for (i = 0; i < inst->magnLen; i++) { besselTmpFX32 = (int32_t)postLocSnr[i]; // Q11 normTmp = WebRtcSpl_NormU32(postLocSnr[i]); num = WEBRTC_SPL_LSHIFT_U32(postLocSnr[i], normTmp); // Q(11+normTmp) if (normTmp > 10) { den = WEBRTC_SPL_LSHIFT_U32(priorLocSnr[i], normTmp - 11); // Q(normTmp) } else { den = WEBRTC_SPL_RSHIFT_U32(priorLocSnr[i], 11 - normTmp); // Q(normTmp) } if (den > 0) { besselTmpFX32 -= WEBRTC_SPL_UDIV(num, den); // Q11 } else { besselTmpFX32 -= num; // Q11 } // inst->logLrtTimeAvg[i] += LRT_TAVG * (besselTmp - log(snrLocPrior) // - inst->logLrtTimeAvg[i]); // Here, LRT_TAVG = 0.5 zeros = WebRtcSpl_NormU32(priorLocSnr[i]); frac32 = (int32_t)(((priorLocSnr[i] << zeros) & 0x7FFFFFFF) >> 19); tmp32 = WEBRTC_SPL_MUL(frac32, frac32); tmp32 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL(tmp32, -43), 19); tmp32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)frac32, 5412, 12); frac32 = tmp32 + 37; // tmp32 = log2(priorLocSnr[i]) tmp32 = (int32_t)(((31 - zeros) << 12) + frac32) - (11 << 12); // Q12 logTmp = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL_32_16(tmp32, 178), 8); // log2(priorLocSnr[i])*log(2) tmp32no1 = WEBRTC_SPL_RSHIFT_W32(logTmp + inst->logLrtTimeAvgW32[i], 1); // Q12 inst->logLrtTimeAvgW32[i] += (besselTmpFX32 - tmp32no1); // Q12 logLrtTimeAvgKsumFX += inst->logLrtTimeAvgW32[i]; // Q12 } inst->featureLogLrt = WEBRTC_SPL_RSHIFT_W32(logLrtTimeAvgKsumFX * 5, inst->stages + 10); // 5 = BIN_SIZE_LRT / 2 // done with computation of LR factor // //compute the indicator functions // // average LRT feature // FLOAT code // indicator0 = 0.5 * (tanh(widthPrior * // (logLrtTimeAvgKsum - threshPrior0)) + 1.0); tmpIndFX = 16384; // Q14(1.0) tmp32no1 = logLrtTimeAvgKsumFX - inst->thresholdLogLrt; // Q12 nShifts = 7 - inst->stages; // WIDTH_PR_MAP_SHIFT - inst->stages + 5; //use larger width in tanh map for pause regions if (tmp32no1 < 0) { tmpIndFX = 0; tmp32no1 = -tmp32no1; //widthPrior = widthPrior * 2.0; nShifts++; } tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, nShifts); // Q14 // compute indicator function: sigmoid map tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 14); if ((tableIndex < 16) && (tableIndex >= 0)) { tmp16no2 = kIndicatorTable[tableIndex]; tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex]; frac = (int16_t)(tmp32no1 & 0x00003fff); // Q14 tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14); if (tmpIndFX == 0) { tmpIndFX = 8192 - tmp16no2; // Q14 } else { tmpIndFX = 8192 + tmp16no2; // Q14 } } indPriorFX = WEBRTC_SPL_MUL_16_16(inst->weightLogLrt, tmpIndFX); // 6*Q14 //spectral flatness feature if (inst->weightSpecFlat) { tmpU32no1 = WEBRTC_SPL_UMUL(inst->featureSpecFlat, 400); // Q10 tmpIndFX = 16384; // Q14(1.0) //use larger width in tanh map for pause regions tmpU32no2 = inst->thresholdSpecFlat - tmpU32no1; //Q10 nShifts = 4; if (inst->thresholdSpecFlat < tmpU32no1) { tmpIndFX = 0; tmpU32no2 = tmpU32no1 - inst->thresholdSpecFlat; //widthPrior = widthPrior * 2.0; nShifts++; } tmp32no1 = (int32_t)WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2, nShifts), 25); //Q14 tmpU32no1 = WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2, nShifts), 25); //Q14 // compute indicator function: sigmoid map // FLOAT code // indicator1 = 0.5 * (tanh(sgnMap * widthPrior * // (threshPrior1 - tmpFloat1)) + 1.0); tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14); if (tableIndex < 16) { tmp16no2 = kIndicatorTable[tableIndex]; tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex]; frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14 tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14); if (tmpIndFX) { tmpIndFX = 8192 + tmp16no2; // Q14 } else { tmpIndFX = 8192 - tmp16no2; // Q14 } } indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecFlat, tmpIndFX); // 6*Q14 } //for template spectral-difference if (inst->weightSpecDiff) { tmpU32no1 = 0; if (inst->featureSpecDiff) { normTmp = WEBRTC_SPL_MIN(20 - inst->stages, WebRtcSpl_NormU32(inst->featureSpecDiff)); tmpU32no1 = WEBRTC_SPL_LSHIFT_U32(inst->featureSpecDiff, normTmp); // Q(normTmp-2*stages) tmpU32no2 = WEBRTC_SPL_RSHIFT_U32(inst->timeAvgMagnEnergy, 20 - inst->stages - normTmp); if (tmpU32no2 > 0) { // Q(20 - inst->stages) tmpU32no1 = WEBRTC_SPL_UDIV(tmpU32no1, tmpU32no2); } else { tmpU32no1 = (uint32_t)(0x7fffffff); } } tmpU32no3 = WEBRTC_SPL_UDIV(WEBRTC_SPL_LSHIFT_U32(inst->thresholdSpecDiff, 17), 25); tmpU32no2 = tmpU32no1 - tmpU32no3; nShifts = 1; tmpIndFX = 16384; // Q14(1.0) //use larger width in tanh map for pause regions if (tmpU32no2 & 0x80000000) { tmpIndFX = 0; tmpU32no2 = tmpU32no3 - tmpU32no1; //widthPrior = widthPrior * 2.0; nShifts--; } tmpU32no1 = WEBRTC_SPL_RSHIFT_U32(tmpU32no2, nShifts); // compute indicator function: sigmoid map /* FLOAT code indicator2 = 0.5 * (tanh(widthPrior * (tmpFloat1 - threshPrior2)) + 1.0); */ tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14); if (tableIndex < 16) { tmp16no2 = kIndicatorTable[tableIndex]; tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex]; frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14 tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( tmp16no1, frac, 14); if (tmpIndFX) { tmpIndFX = 8192 + tmp16no2; } else { tmpIndFX = 8192 - tmp16no2; } } indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecDiff, tmpIndFX); // 6*Q14 } //combine the indicator function with the feature weights // FLOAT code // indPrior = 1 - (weightIndPrior0 * indicator0 + weightIndPrior1 * // indicator1 + weightIndPrior2 * indicator2); indPriorFX16 = WebRtcSpl_DivW32W16ResW16(98307 - indPriorFX, 6); // Q14 // done with computing indicator function //compute the prior probability // FLOAT code // inst->priorNonSpeechProb += PRIOR_UPDATE * // (indPriorNonSpeech - inst->priorNonSpeechProb); tmp16 = indPriorFX16 - inst->priorNonSpeechProb; // Q14 inst->priorNonSpeechProb += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT( PRIOR_UPDATE_Q14, tmp16, 14); // Q14 //final speech probability: combine prior model with LR factor: memset(nonSpeechProbFinal, 0, sizeof(uint16_t) * inst->magnLen); if (inst->priorNonSpeechProb > 0) { for (i = 0; i < inst->magnLen; i++) { // FLOAT code // invLrt = exp(inst->logLrtTimeAvg[i]); // invLrt = inst->priorSpeechProb * invLrt; // nonSpeechProbFinal[i] = (1.0 - inst->priorSpeechProb) / // (1.0 - inst->priorSpeechProb + invLrt); // invLrt = (1.0 - inst->priorNonSpeechProb) * invLrt; // nonSpeechProbFinal[i] = inst->priorNonSpeechProb / // (inst->priorNonSpeechProb + invLrt); if (inst->logLrtTimeAvgW32[i] < 65300) { tmp32no1 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL( inst->logLrtTimeAvgW32[i], 23637), 14); // Q12 intPart = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 12); if (intPart < -8) { intPart = -8; } frac = (int16_t)(tmp32no1 & 0x00000fff); // Q12 // Quadratic approximation of 2^frac tmp32no2 = WEBRTC_SPL_RSHIFT_W32(frac * frac * 44, 19); // Q12 tmp32no2 += WEBRTC_SPL_MUL_16_16_RSFT(frac, 84, 7); // Q12 invLrtFX = WEBRTC_SPL_LSHIFT_W32(1, 8 + intPart) + WEBRTC_SPL_SHIFT_W32(tmp32no2, intPart - 4); // Q8 normTmp = WebRtcSpl_NormW32(invLrtFX); normTmp2 = WebRtcSpl_NormW16((16384 - inst->priorNonSpeechProb)); if (normTmp + normTmp2 >= 7) { if (normTmp + normTmp2 < 15) { invLrtFX = WEBRTC_SPL_RSHIFT_W32(invLrtFX, 15 - normTmp2 - normTmp); // Q(normTmp+normTmp2-7) tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX, (16384 - inst->priorNonSpeechProb)); // Q(normTmp+normTmp2+7) invLrtFX = WEBRTC_SPL_SHIFT_W32(tmp32no1, 7 - normTmp - normTmp2); // Q14 } else { tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX, (16384 - inst->priorNonSpeechProb)); // Q22 invLrtFX = WEBRTC_SPL_RSHIFT_W32(tmp32no1, 8); // Q14 } tmp32no1 = WEBRTC_SPL_LSHIFT_W32((int32_t)inst->priorNonSpeechProb, 8); // Q22 nonSpeechProbFinal[i] = (uint16_t)WEBRTC_SPL_DIV(tmp32no1, (int32_t)inst->priorNonSpeechProb + invLrtFX); // Q8 } } } } }
// Contains a function for the core loop in the normalized lattice MA // filter routine for iSAC codec, optimized for ARM Neon platform. // It does: // for 0 <= n < HALF_SUBFRAMELEN - 1: // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); // Output is not bit-exact with the reference C code, due to the replacement // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon // instructions. The difference should not be bigger than 1. void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coefficient int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) // Sample buffer { int n = 0; int loop = (HALF_SUBFRAMELEN - 1) >> 3; int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); int32x4_t input2_v = vdupq_n_s32(input2); int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; int32x4_t ptr0va, ptr1va, ptr2va; int32x4_t ptr0vb, ptr1vb, ptr2vb; // Unroll to process 8 samples at once. for (n = 0; n < loop; n++) { ptr0va = vld1q_s32(ptr0); ptr0vb = vld1q_s32(ptr0 + 4); ptr0 += 8; ptr2va = vld1q_s32(ptr2); ptr2vb = vld1q_s32(ptr2 + 4); // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2b = vaddq_s32(tmp0b, ptr2vb); tmp2a = vshlq_n_s32(tmp2a, 15); tmp2b = vshlq_n_s32(tmp2b, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); vst1q_s32(ptr2, ptr2va); vst1q_s32(ptr2 + 4, ptr2vb); ptr2 += 8; // Calculate tmp3 = ptr2v * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); ptr1vb = vaddq_s32(tmp1b, tmp3b); vst1q_s32(ptr1, ptr1va); vst1q_s32(ptr1 + 4, ptr1vb); ptr1 += 8; } // Process four more samples. if (loop_tail & 0x4) { ptr0va = vld1q_s32(ptr0); ptr2va = vld1q_s32(ptr2); ptr0 += 4; // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2a = vshlq_n_s32(tmp2a, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); vst1q_s32(ptr2, ptr2va); ptr2 += 4; // Calculate tmp3 = *(ptr2) * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); vst1q_s32(ptr1, ptr1va); ptr1 += 4; } // Process two more samples. if (loop_tail & 0x2) { int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; ptr0v_tail = vld1_s32(ptr0); ptr2v_tail = vld1_s32(ptr2); ptr0 += 2; // Calculate tmp0 = (*ptr0) * input0. tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); // Calculate tmp1 = (*ptr0) * input1. tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); // Calculate tmp2 = tmp0 + *(ptr2). tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); tmp2_tail = vshl_n_s32(tmp2_tail, 15); // Calculate *ptr2 = input2 * tmp2. ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); vst1_s32(ptr2, ptr2v_tail); ptr2 += 2; // Calculate tmp3 = *(ptr2) * input0. tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); // Calculate *ptr1 = tmp1 + tmp3. ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); vst1_s32(ptr1, ptr1v_tail); ptr1 += 2; } // Process one more sample. if (loop_tail & 0x1) { int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; int32_t tmp32a; int32_t tmp32b; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); tmp32b = *ptr2 + tmp32a; *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); *ptr1 = tmp32a + tmp32b; }