/*
 * Transform a masking curve (power spectrum) into a pole-zero filter.
 *
 * psy    supplies the transform size (psy->n) and the FFT lookup table.
 * curve  power spectrum; len = psy->n/2 values are read.  It is overwritten
 *        in place with the correction curve 1/(1e-6 + curve[i]*P1[i]), where
 *        P1 is the power response computed from awk1.
 * awk1   receives ord coefficients fitted to the curve itself.
 * awk2   receives ord coefficients fitted to the correction curve.
 * ord    number of coefficients written to each of awk1 and awk2.
 *
 * Coefficient k of both filters is scaled by 0.99^(k+1).
 */
void curve_to_lpc(VorbisPsy *psy, float *curve, float *awk1, float *awk2, int ord)
{
   float ac[psy->n];
   const int len = psy->n >> 1;
   const int span = 2*len;
   float decay;
   int k;

   /* Stage 1: spread the curve over the odd slots of the work buffer
      (bin 0 goes to slot 0, the last bin is repeated in slot 2*len-1) */
   for (k = 0; k < span; ++k)
      ac[k] = 0;
   for (k = 1; k < len; ++k)
      ac[2*k-1] = curve[k];
   ac[0] = curve[0];
   ac[span-1] = curve[len-1];

   /* Back-transform and fit the first filter to the result */
   spx_drft_backward(&psy->lookup, ac);
   _spx_lpc(awk1, ac, ord);

   decay = 1.;
   for (k = 0; k < ord; ++k)
   {
      decay *= .99;
      awk1[k] *= decay;
   }

   /* Stage 2: use the second (awk2) filter to correct the first one.
      Build the sequence [1, awk1[0], ..., awk1[ord-1], 0, ...] */
   for (k = 0; k < span; ++k)
      ac[k] = 0;
   ac[0] = 1;
   for (k = 0; k < ord; ++k)
      ac[k+1] = awk1[k];
   spx_drft_forward(&psy->lookup, ac);

   /* Compute (power) response of awk1 (all zero), in place.  The pass must
      run upwards: slot k is written only after slots 2k-1 and 2k are read */
   ac[0] *= ac[0];
   for (k = 1; k < len; ++k)
   {
      const float a = ac[2*k-1];
      const float b = ac[2*k];
      ac[k] = a*a + b*b;
   }
   ac[len] = ac[span-1]*ac[span-1];

   /* Compute correction required (this overwrites the caller's curve) */
   for (k = 0; k < len; ++k)
      curve[k] = 1. / (1e-6f+curve[k]*ac[k]);

   /* Same layout / back-transform / fit as stage 1, on the correction curve */
   for (k = 0; k < span; ++k)
      ac[k] = 0;
   for (k = 1; k < len; ++k)
      ac[2*k-1] = curve[k];
   ac[0] = curve[0];
   ac[span-1] = curve[len-1];

   spx_drft_backward(&psy->lookup, ac);
   _spx_lpc(awk2, ac, ord);

   decay = 1;
   for (k = 0; k < ord; ++k)
   {
      decay *= .99;
      awk2[k] *= decay;
   }
}
/**
 * Performs echo cancellation on a frame.
 *
 * @param st    Echo canceller state; most of its buffers are updated.
 * @param ref   st->frame_size input samples.  Copied into st->d (called the
 *              "reference signal" below); the estimated echo is subtracted
 *              from this signal.
 * @param echo  st->frame_size input samples.  Copied into st->x (called the
 *              "echo input" below), i.e. the signal run through the filter.
 * @param out   Receives st->frame_size samples of ref minus the filter
 *              output, saturated to [-32768, 32767].
 * @param Yout  Optional, may be NULL.  When non-NULL it receives
 *              st->frame_size+1 residual-echo power estimates.
 *
 * If the mis-adaptation test fires, the state is reset with
 * speex_echo_state_reset() and the function returns early: out[] has already
 * been written, but the weights are not adapted and Yout is left untouched.
 */
void speex_echo_cancel(SpeexEchoState *st, short *ref, short *echo, short *out, float *Yout)
{
   int i,j,m;
   int N,M;
   float scale;
   float ESR;
   float SER;
   float Sry=0,Srr=0,Syy=0,Sey=0,See=0,Sxx=0;
   float leak_estimate;

   /* Leak estimate shrinks from 1 towards .1 as st->sum_adapt grows */
   leak_estimate = .1+(.9/(1+2*st->sum_adapt));

   N = st->window_size;
   M = st->M;
   /* Applied after every inverse transform below */
   scale = 1.0f/N;
   st->cancel_count++;

   /* Copy input data to buffer: slide the second half of x and d into the
      first half and append the new frame */
   for (i=0; i<st->frame_size; i++)
   {
      st->x[i] = st->x[i+st->frame_size];
      st->x[i+st->frame_size] = echo[i];

      st->d[i] = st->d[i+st->frame_size];
      st->d[i+st->frame_size] = ref[i];
   }

   /* Shift memory: this could be optimized eventually */
   for (i=0; i<N*(M-1); i++)
      st->X[i]=st->X[i+N];

   /* Copy new echo frame into the last of the M blocks of X */
   for (i=0; i<N; i++)
      st->X[(M-1)*N+i]=st->x[i];

   /* Convert x (echo input) to frequency domain */
   spx_drft_forward(st->fft_lookup, &st->X[(M-1)*N]);

   /* Compute filter response Y: accumulate X*W over all M blocks */
   for (i=0; i<N; i++)
      st->Y[i] = 0;
   for (j=0; j<M; j++)
      spectral_mul_accum(&st->X[j*N], &st->W[j*N], st->Y, N);

   /* Convert Y (filter response) to time domain */
   for (i=0; i<N; i++)
      st->y[i] = st->Y[i];
   spx_drft_backward(st->fft_lookup, st->y);
   for (i=0; i<N; i++)
      st->y[i] *= scale;

   /* Transform d (reference signal) to frequency domain */
   for (i=0; i<N; i++)
      st->D[i]=st->d[i];
   spx_drft_forward(st->fft_lookup, st->D);

   /* Compute error signal (signal with echo removed).  Only the second half
      of y is used; the first half of E is zeroed. */
   for (i=0; i<st->frame_size; i++)
   {
      float tmp_out;
      tmp_out = (float)ref[i] - st->y[i+st->frame_size];

      st->E[i] = 0;
      /* E keeps the unsaturated value */
      st->E[i+st->frame_size] = tmp_out;

      /* Saturation */
      if (tmp_out>32767)
         tmp_out = 32767;
      else if (tmp_out<-32768)
         tmp_out = -32768;
      out[i] = tmp_out;
   }

   /* This bit of code provides faster adaptation by doing a projection of the previous gradient on the
      "MMSE surface" */
   if (1)
   {
      /* NOTE(review): this Syy shadows the function-level Syy declared above */
      float Sge, Sgg, Syy;
      float gain;
      Syy = inner_prod(st->y+st->frame_size, st->y+st->frame_size, st->frame_size);

      /* Y2/y2: response of the previous gradient PHI to the current input */
      for (i=0; i<N; i++)
         st->Y2[i] = 0;
      for (j=0; j<M; j++)
         spectral_mul_accum(&st->X[j*N], &st->PHI[j*N], st->Y2, N);
      for (i=0; i<N; i++)
         st->y2[i] = st->Y2[i];
      spx_drft_backward(st->fft_lookup, st->y2);
      for (i=0; i<N; i++)
         st->y2[i] *= scale;
      Sge = inner_prod(st->y2+st->frame_size, st->E+st->frame_size, st->frame_size);
      Sgg = inner_prod(st->y2+st->frame_size, st->y2+st->frame_size, st->frame_size);

      /* Compute projection gain, limited to [-2, 2] */
      gain = Sge/(N+.03*Syy+Sgg);
      if (gain>2)
         gain = 2;
      if (gain < -2)
         gain = -2;

      /* Apply gain to weights, echo estimates, output.
         NOTE(review): out[] was already written in the loop above and is not
         revisited here; only Y, y, E and W are corrected. */
      for (i=0; i<N; i++)
         st->Y[i] += gain*st->Y2[i];
      for (i=0; i<st->frame_size; i++)
      {
         st->y[i+st->frame_size] += gain*st->y2[i+st->frame_size];
         st->E[i+st->frame_size] -= gain*st->y2[i+st->frame_size];
      }
      for (i=0; i<M*N; i++)
         st->W[i] += gain*st->PHI[i];
   }

   /* Compute power spectrum of output (D-Y) and filter response (Y) */
   for (i=0; i<N; i++)
      st->D[i] -= st->Y[i];
   power_spectrum(st->D, st->Rf, N);
   power_spectrum(st->Y, st->Yf, N);

   /* Compute frequency-domain adaptation mask (per-bin ratio capped at 1) */
   for (j=0; j<=st->frame_size; j++)
   {
      float r;
      r = leak_estimate*st->Yf[j] / (1+st->Rf[j]);
      if (r>1)
         r = 1;
      st->fratio[j] = r;
   }

   /* Compute a bunch of correlations.
      NOTE(review): Sry is computed here but never read afterwards in this
      function. */
   Sry = inner_prod(st->y+st->frame_size, st->d+st->frame_size, st->frame_size);
   Sey = inner_prod(st->y+st->frame_size, st->E+st->frame_size, st->frame_size);
   See = inner_prod(st->E+st->frame_size, st->E+st->frame_size, st->frame_size);
   Syy = inner_prod(st->y+st->frame_size, st->y+st->frame_size, st->frame_size);
   Srr = inner_prod(st->d+st->frame_size, st->d+st->frame_size, st->frame_size);
   Sxx = inner_prod(st->x+st->frame_size, st->x+st->frame_size, st->frame_size);

   /* Compute smoothed cross-correlation and energy */
   st->Sey = .98*st->Sey + .02*Sey;
   st->Syy = .98*st->Syy + .02*Syy;
   st->See = .98*st->See + .02*See;

   /* Check if filter is completely mis-adapted (if so, reset filter) */
   if (st->Sey/(1+st->Syy + .01*st->See) < -1)
   {
      /*fprintf (stderr, "reset at %d\n", st->cancel_count);*/
      speex_echo_state_reset(st);
      return;
   }

   /* SER: energy of d over energy of x; ESR: leak-weighted energy of y over
      energy of E, capped at 1 */
   SER = Srr / (1+Sxx);
   ESR = leak_estimate*Syy / (1+See);
   if (ESR>1)
      ESR = 1;
#if 1
   /* If over-cancellation (creating echo with 180 phase) damp filter */
   if (st->Sey/(1+st->Syy) < -.1 && (ESR > .3))
   {
      for (i=0; i<M*N; i++)
         st->W[i] *= .95;
      st->Sey *= .5;
      /*fprintf (stderr, "corrected down\n");*/
   }
#endif
#if 1
   /* If under-cancellation (leaving echo with 0 phase) scale filter up */
   if (st->Sey/(1+st->Syy) > .1 && (ESR > .1 || SER < 10))
   {
      for (i=0; i<M*N; i++)
         st->W[i] *= 1.05;
      st->Sey *= .5;
      /*fprintf (stderr, "corrected up %d\n", st->cancel_count);*/
   }
#endif

   /* We consider that the filter is adapted if the following is true.
      The flag is only ever set here, never cleared. */
   if (ESR>.6 && st->sum_adapt > 1)
   {
      /*if (!st->adapted)
         fprintf(stderr, "Adapted at %d %f\n", st->cancel_count, st->sum_adapt);*/
      st->adapted = 1;
   }

   /* Update frequency-dependent energy ratio with the total energy ratio */
   for (i=0; i<=st->frame_size; i++)
   {
      st->fratio[i] = (.2*ESR+.8*min(.005+ESR,st->fratio[i]));
   }

   if (st->adapted)
   {
      st->adapt_rate = .95f/(2+M);
   } else {
      /* Temporary adaptation rate if filter is not adapted correctly:
         the larger SER is, the smaller the step (zero from SER >= 30) */
      if (SER<.1)
         st->adapt_rate =.8/(2+M);
      else if (SER<1)
         st->adapt_rate =.4/(2+M);
      else if (SER<10)
         st->adapt_rate =.2/(2+M);
      else if (SER<30)
         st->adapt_rate =.08/(2+M);
      else
         st->adapt_rate = 0;
   }

   /* How much have we adapted so far? */
   st->sum_adapt += st->adapt_rate;

   /* Compute echo power in each frequency bin */
   {
      /* Smoothing factor: 1/cancel_count, but never below .3/M */
      float ss = 1.0f/st->cancel_count;
      if (ss < .3/M)
         ss=.3/M;
      power_spectrum(&st->X[(M-1)*N], st->Xf, N);
      /* Smooth echo energy estimate over time */
      for (j=0; j<=st->frame_size; j++)
         st->power[j] = (1-ss)*st->power[j] + ss*st->Xf[j];

      /* Combine adaptation rate with the inverse energy estimate */
      if (st->adapted)
      {
         /* If filter is adapted, include the frequency-dependent ratio too */
         for (i=0; i<=st->frame_size; i++)
            st->power_1[i] = st->adapt_rate*st->fratio[i] /(1.f+st->power[i]);
      } else {
         for (i=0; i<=st->frame_size; i++)
            st->power_1[i] = st->adapt_rate/(1.f+st->power[i]);
      }
   }

   /* Convert error to frequency domain */
   spx_drft_forward(st->fft_lookup, st->E);

   /* Do some regularization (prevents problems when system is ill-conditioned) */
   for (m=0; m<M; m++)
      for (i=0; i<N; i++)
         st->W[m*N+i] *= 1-st->regul[i]*ESR;

   /* Compute weight gradient (stored in PHI, one block per filter block) */
   for (j=0; j<M; j++)
   {
      weighted_spectral_mul_conj(st->power_1, &st->X[j*N], st->E, st->PHI+N*j, N);
   }

   /* Gradient descent */
   for (i=0; i<M*N; i++)
      st->W[i] += st->PHI[i];

   /* AUMDF weight constraint: only block (cancel_count mod M) is constrained
      on each call, by zeroing the second half of its time-domain weights */
   for (j=0; j<M; j++)
   {
      /* Remove the "if" to make this an MDF filter */
      if (st->cancel_count%M == j)
      {
         spx_drft_backward(st->fft_lookup, &st->W[j*N]);
         for (i=0; i<N; i++)
            st->W[j*N+i]*=scale;
         for (i=st->frame_size; i<N; i++)
         {
            st->W[j*N+i]=0;
         }
         spx_drft_forward(st->fft_lookup, &st->W[j*N]);
      }
   }

   /* Compute spectrum of estimated echo for use in an echo post-filter (if necessary) */
   if (Yout)
   {
      if (st->adapted)
      {
         /* If the filter is adapted, take the filtered echo */
         for (i=0; i<st->frame_size; i++)
            st->last_y[i] = st->last_y[st->frame_size+i];
         for (i=0; i<st->frame_size; i++)
            st->last_y[st->frame_size+i] = st->y[st->frame_size+i];
      } else {
         /* If filter isn't adapted yet, all we can do is take the echo signal directly */
         for (i=0; i<N; i++)
            st->last_y[i] = st->x[i];
      }

      /* Apply hanning window (should pre-compute it) */
      for (i=0; i<N; i++)
         st->Yps[i] = (.5-.5*cos(2*M_PI*i/N))*st->last_y[i];

      /* Compute power spectrum of the echo (done in place on Yps) */
      spx_drft_forward(st->fft_lookup, st->Yps);
      power_spectrum(st->Yps, st->Yps, N);

      /* Estimate residual echo */
      for (i=0; i<=st->frame_size; i++)
         Yout[i] = 2*leak_estimate*st->Yps[i];
   }
}
/**
 * Runs the preprocessor on one frame, in place.
 *
 * @param st    Preprocessor state; its noise, SNR, gain and overlap buffers
 *              are updated.
 * @param x     Frame buffer.  It is handed to preprocess_analysis() and then
 *              overwritten with st->frame_size output samples (N3 overlap-added
 *              samples followed by N4 samples taken directly from the frame).
 * @param echo  Optional, may be NULL.  Per-bin residual echo estimate; bins
 *              1..N-1 are read here, with N = st->ps_size.
 * @return 1, or the value returned by speex_compute_vad() when
 *         st->vad_enabled is set.
 */
int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, float *echo)
{
   int i;
   int is_speech=1;
   float mean_post=0;
   float mean_prior=0;
   int N = st->ps_size;
   /* N3: samples overlap-added with the previous frame; N4: the rest */
   int N3 = 2*N - st->frame_size;
   int N4 = st->frame_size - N3;
   /* Applied after the inverse transform near the end */
   float scale=.5f/N;
   float *ps=st->ps;
   float Zframe=0, Pframe;

   preprocess_analysis(st, x);

   update_noise_prob(st);

   st->nb_preprocess++;

   /* Noise estimation always updated for the 20 first times.
      NOTE(review): the test below uses nb_adapt<10, not 20 -- confirm which
      is intended. */
   if (st->nb_adapt<10)
   {
      update_noise(st, ps, echo);
   }

   /* Deal with residual echo if provided */
   if (echo)
      for (i=1;i<N;i++)
         st->echo_noise[i] = (.3f*st->echo_noise[i] + echo[i]);

   /* Compute a posteriori SNR (capped at 100, may be negative) */
   for (i=1;i<N;i++)
   {
      st->post[i] = ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]) - 1.f;
      if (st->post[i]>100.f)
         st->post[i]=100.f;
      /*if (st->post[i]<0)
        st->post[i]=0;*/
      mean_post+=st->post[i];
   }
   /* The sum covers bins 1..N-1 but is divided by N */
   mean_post /= N;
   if (mean_post<0.f)
      mean_post=0.f;

   /* Special case for first frame */
   if (st->nb_adapt==1)
      for (i=1;i<N;i++)
         st->old_ps[i] = ps[i];

   /* Compute a priori SNR */
   {
      /* A priori update rate.
         NOTE(review): everything computed for gamma/min_gamma below is
         overridden by the final "gamma = .1"; mean_prior is also still 0 at
         this point. */
      float gamma;
      float min_gamma=0.12f;
      gamma = 1.0f/st->nb_preprocess;

      /*Make update rate smaller when there's no speech*/
#if 0
      if (mean_post<3.5 && mean_prior < 1)
         min_gamma *= (mean_post+.5);
      else
         min_gamma *= 4.;
#else
      min_gamma = .1f*fabs(mean_prior - mean_post)*fabs(mean_prior - mean_post);
      if (min_gamma>.15f)
         min_gamma = .15f;
      if (min_gamma<.02f)
         min_gamma = .02f;
#endif
      /*min_gamma = .08;*/

      /*if (gamma<min_gamma)*/
         gamma=min_gamma;
      gamma = .1;
      for (i=1;i<N;i++)
      {

         /* A priori SNR update: blend of the clipped a posteriori SNR and the
            previous frame's gained power over the current noise floor */
         st->prior[i] = gamma*max(0.0f,st->post[i]) +
         (1.f-gamma)*st->gain[i]*st->gain[i]*st->old_ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]);

         if (st->prior[i]>100.f)
            st->prior[i]=100.f;

         mean_prior+=st->prior[i];
      }
   }
   mean_prior /= N;

#if 0
   for (i=0;i<N;i++)
   {
      fprintf (stderr, "%f ", st->prior[i]);
   }
   fprintf (stderr, "\n");
#endif
   /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/

   /* From the 20th call on, count consecutive frames that look like noise */
   if (st->nb_preprocess>=20)
   {
      int do_update = 0;
      float noise_ener=0, sig_ener=0;
      /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
      /*if (mean_prior<.23 && mean_post < .5)*/
      if (mean_prior<.23f && mean_post < .5f)
         do_update = 1;
      for (i=1;i<N;i++)
      {
         noise_ener += st->noise[i];
         sig_ener += ps[i];
      }
      /* Also treat the frame as noise when the noise estimate is far above
         the signal energy */
      if (noise_ener > 3.f*sig_ener)
         do_update = 1;
      /*do_update = 0;*/
      if (do_update)
      {
         st->consec_noise++;
      } else {
         st->consec_noise=0;
      }
   }

   if (st->vad_enabled)
      is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);

   if (st->consec_noise>=3)
   {
      /* Three or more noise frames in a row: full update from the previous
         frame's power spectrum */
      update_noise(st, st->old_ps, echo);
   } else {
      /* Otherwise update only bins 1..N-2 that pass the per-bin test */
      for (i=1;i<N-1;i++)
      {
         if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
         {
            if (echo)
               st->noise[i] = .90f*st->noise[i] + .1f*max(1.0f,st->ps[i]-echo[i]);
            else
               st->noise[i] = .90f*st->noise[i] + .1f*st->ps[i];
         }
      }
   }

   /* Recursive average of the a priori SNR */
   for (i=1;i<N;i++)
   {
      st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
   }

   /* Sum zeta over the bins corresponding to 300..2000 (scaled by
      2*N/sampling_rate) */
   {
      int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
      int freq_end   = (int)(2000.0f*2.f*N/st->sampling_rate);
      for (i=freq_start;i<freq_end;i++)
      {
         Zframe += st->zeta[i];
      }
   }

   /* Normalised by N, not by the number of bins summed */
   Zframe /= N;
   if (Zframe<ZMIN)
   {
      Pframe = 0;
   } else {
      if (Zframe > 1.5f*st->Zlast)
      {
         /* Sharp rise: record a new peak, clamped to [1, 10] */
         Pframe = 1.f;
         st->Zpeak = Zframe;
         if (st->Zpeak > 10.f)
            st->Zpeak = 10.f;
         if (st->Zpeak < 1.f)
            st->Zpeak = 1.f;
      } else {
         if (Zframe < st->Zpeak*ZMIN)
         {
            Pframe = 0;
         } else if (Zframe > st->Zpeak*ZMAX)
         {
            Pframe = 1;
         } else {
            Pframe = log(Zframe/(st->Zpeak*ZMIN)) / log(ZMAX/ZMIN);
         }
      }
   }
   st->Zlast = Zframe;

   /*fprintf (stderr, "%f\n", Pframe);*/
   /* Compute gain according to the Ephraim-Malah algorithm */
   for (i=1;i<N;i++)
   {
      float MM;
      float theta;
      float prior_ratio;
      float p, q;
      float zeta1;
      float P1;

      prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
      theta = (1.f+st->post[i])*prior_ratio;

      /* Smooth zeta over neighbouring bins, except at the two edge bins */
      if (i==1 || i==N-1)
         zeta1 = st->zeta[i];
      else
         zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
      if (zeta1<ZMIN)
         P1 = 0.f;
      else if (zeta1>ZMAX)
         P1 = 1.f;
      else
         P1 = LOG_MIN_MAX_1 * log(ZMIN_1*zeta1);

      /*P1 = log(zeta1/ZMIN)/log(ZMAX/ZMIN);*/
      /* FIXME: add global prob (P2) */
      /* NOTE(review): the first assignment to q is overwritten immediately,
         so Pframe does not influence the gain */
      q = 1-Pframe*P1;
      q = 1-P1;
      if (q>.95f)
         q=.95f;
      p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
      /*p=1;*/

#if 0
      /* log-spectral magnitude estimator */
      if (theta<6)
         MM = 0.74082*pow(theta+1,.61)/sqrt(.0001+theta);
      else
         MM=1;
#else
      /* Optimal estimator for loudness domain */
      MM = hypergeom_gain(theta);
#endif

      st->gain[i] = prior_ratio * MM;

      /*Put some (very arbitrary) limit on the gain*/
      if (st->gain[i]>2.f)
      {
         st->gain[i]=2.f;
      }

      /* Decaying reverberation estimate fed by the gained power spectrum */
      st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
      if (st->denoise_enabled)
      {
         st->gain2[i]=p*p*st->gain[i];
      } else {
         st->gain2[i]=1.f;
      }
   }
   st->gain2[0]=st->gain[0]=0.f;
   st->gain2[N-1]=st->gain[N-1]=0.f;

   if (st->agc_enabled)
      speex_compute_agc(st, mean_prior);

#if 0
   if (!is_speech)
   {
      for (i=0;i<N;i++)
         st->gain2[i] = 0;
   }
#if 0
 else {
      for (i=0;i<N;i++)
         st->gain2[i] = 1;
   }
#endif
#endif

   /* Apply computed gain: bin i occupies frame[2*i-1] and frame[2*i] */
   for (i=1;i<N;i++)
   {
      st->frame[2*i-1] *= st->gain2[i];
      st->frame[2*i] *= st->gain2[i];
   }

   /* Get rid of the DC and very low frequencies */
   st->frame[0]=0;
   st->frame[1]=0;
   st->frame[2]=0;
   /* Nyquist frequency is mostly useless too */
   st->frame[2*N-1]=0;

   /* Inverse FFT with 1/N scaling */
   spx_drft_backward(st->fft_lookup, st->frame);

   for (i=0;i<2*N;i++)
      st->frame[i] *= scale;

   /* Scale the whole frame down if its peak exceeds 28000 */
   {
      float max_sample=0;
      for (i=0;i<2*N;i++)
         if (fabs(st->frame[i])>max_sample)
            max_sample = fabs(st->frame[i]);
      if (max_sample>28000.f)
      {
         float damp = 28000.f/max_sample;
         for (i=0;i<2*N;i++)
            st->frame[i] *= damp;
      }
   }

   /* Synthesis window */
   for (i=0;i<2*N;i++)
      st->frame[i] *= st->window[i];

   /* Perform overlap and add */
   for (i=0;i<N3;i++)
      x[i] = st->outbuf[i] + st->frame[i];
   for (i=0;i<N4;i++)
      x[N3+i] = st->frame[N3+i];

   /* Update outbuf */
   for (i=0;i<N3;i++)
      st->outbuf[i] = st->frame[st->frame_size+i];

   /* Save old power spectrum */
   for (i=1;i<N;i++)
      st->old_ps[i] = ps[i];

   return is_speech;
}
/*
 * Decorrelate one frame of interleaved multi-channel audio.
 *
 * st        decorrelator state (per-channel history, all-pass ring buffer,
 *           filter order and alpha, plus the shared random seed).
 * in        st->frame_size * st->channels interleaved input samples.
 * out       receives the same number of interleaved output samples, clipped
 *           to [-32767, 32767].
 * strength  clamped to [0, 100] and scaled to an amount in [0, 1].
 *
 * Each channel is filtered twice per frame: once with the previous order and
 * alpha, weighted by the second half of st->vorbis_win, and once with freshly
 * randomised order and alpha, weighted by the first half.  The two results
 * are summed.
 */
EXPORT void speex_decorrelate(SpeexDecorrState *st, const spx_int16_t *in, spx_int16_t *out, int strength)
{
   int chan;
   float amount;

   if (strength < 0)
      strength = 0;
   else if (strength > 100)
      strength = 100;
   amount = .01*strength;

   for (chan = 0; chan < st->channels; chan++)
   {
      /* hist holds two frames: the previous one, then the current one */
      float *hist = st->buff + chan*2*st->frame_size;
      float *cur = hist + st->frame_size;
      float *ring = st->ring[chan];
      int pos = st->ringID[chan];
      int order = st->order[chan];
      float alpha = st->alpha[chan];
      float beta, beta_cap;
      float alpha_limit = 0;
      int n;

      /* Slide the history by one frame and append the new samples */
      for (n = 0; n < st->frame_size; n++)
         hist[n] = cur[n];
      for (n = 0; n < st->frame_size; n++)
         cur[n] = in[n*st->channels+chan];

      /* This first value is superseded by the selection right below */
      beta = 1.-.3*amount*amount;
      beta = (amount > 1) ? 1-sqrt(.4*amount) : 1-0.63246*amount;
      if (beta < 0)
         beta = 0;
      beta_cap = beta;

      /* Pass 1: previous order/alpha, fading out (second half of window) */
      for (n = 0; n < st->frame_size; n++)
      {
         /* Slot that follows pos in the ring; it becomes the new pos */
         const int next = (pos+1 >= order) ? 0 : pos+1;

         st->y[n] = alpha*(cur[n-ALLPASS_ORDER+order]-beta*cur[n-ALLPASS_ORDER+order-1])*st->vorbis_win[st->frame_size+n+order]
               + cur[n-ALLPASS_ORDER]*st->vorbis_win[st->frame_size+n]
               - alpha*(ring[pos] - beta*ring[next]);
         ring[pos] = st->y[n];
         st->y[n] *= st->vorbis_win[st->frame_size+n];
         pos = next;
      }

      /* Random walk on the order (step -1/0/+1), kept within [5, 10] */
      order = order+(irand(&st->seed)%3)-1;
      if (order < 5)
         order = 5;
      if (order > 10)
         order = 10;
      /*order = 5+(irand(&st->seed)%6);*/

      /* Random walk on alpha, bounded by a limit derived from amount/order */
      alpha_limit = pow(.96+.04*(amount-1),order);
      if (alpha_limit > .98/(1.+beta_cap))
         alpha_limit = .98/(1.+beta_cap);

      alpha = alpha + .4*uni_rand(&st->seed);
      if (alpha > alpha_limit)
         alpha = alpha_limit;
      if (alpha < -alpha_limit)
         alpha = -alpha_limit;

      /* Restart the ring buffer for the new filter */
      for (n = 0; n < ALLPASS_ORDER; n++)
         ring[n] = 0;
      pos = 0;

      /* Pass 2: new order/alpha, fading in (first half of window) */
      for (n = 0; n < st->frame_size; n++)
      {
         const int next = (pos+1 >= order) ? 0 : pos+1;
         float fresh = alpha*(cur[n-ALLPASS_ORDER+order]-beta*cur[n-ALLPASS_ORDER+order-1])*st->vorbis_win[n+order]
               + cur[n-ALLPASS_ORDER]*st->vorbis_win[n]
               - alpha*(ring[pos] - beta*ring[next]);

         ring[pos] = fresh;
         fresh *= st->vorbis_win[n];
         pos = next;
         st->y[n] += fresh;
      }

#ifdef VORBIS_PSYCHO
      /* Optional: add windowed random noise shaped by the masking curve */
      int N = 2*st->frame_size;
      float frame[N];
      for (n = 0; n < 2*st->frame_size; n++)
         frame[n] = hist[n];
      /*float coef = .5*0.78130;*/
      float coef = M_PI*0.075063 * 0.93763 * amount * .8 * 0.707;
      compute_curve(st->psy, hist, st->curve);
      for (n = 1; n < st->frame_size; n++)
      {
         float x1, x2;
         float gain;
         /* Draw pairs until one satisfies x1^2 + x2^2 <= 1 */
         do {
            x1 = uni_rand(&st->seed);
            x2 = uni_rand(&st->seed);
         } while (x1*x1+x2*x2 > 1.);
         gain = coef*sqrt(.1+st->curve[n]);
         frame[2*n-1] = gain*x1;
         frame[2*n] = gain*x2;
      }
      frame[0] = coef*uni_rand(&st->seed)*sqrt(.1+st->curve[0]);
      frame[2*st->frame_size-1] = coef*uni_rand(&st->seed)*sqrt(.1+st->curve[st->frame_size-1]);
      spx_drft_backward(&st->lookup,frame);
      for (n = 0; n < 2*st->frame_size; n++)
         frame[n] *= st->vorbis_win[n];
#endif

      /* Clip and write the interleaved output */
      for (n = 0; n < st->frame_size; n++)
      {
#ifdef VORBIS_PSYCHO
         /* NOTE(review): st->wola_mem is indexed without a channel offset,
            so it is shared by all channels -- confirm this is intended */
         float sample = st->y[n] + frame[n] + st->wola_mem[n];
         st->wola_mem[n] = frame[n+st->frame_size];
#else
         float sample = st->y[n];
#endif
         if (sample > 32767)
            sample = 32767;
         if (sample < -32767)
            sample = -32767;
         out[n*st->channels+chan] = sample;
      }

      /* Save the per-channel filter state for the next frame */
      st->ringID[chan] = pos;
      st->order[chan] = order;
      st->alpha[chan] = alpha;
   }
}
/**
 * Runs the preprocessor on one frame, in place.
 *
 * @param st    Preprocessor state; its noise, SNR, gain and overlap buffers
 *              are updated.
 * @param x     Frame buffer.  It is handed to preprocess_analysis() and then
 *              overwritten with st->frame_size output samples (N3 overlap-added
 *              samples followed by N4 samples taken directly from the frame).
 * @param echo  Optional, may be NULL.  Per-bin residual echo estimate; bins
 *              1..N-1 are read here (N = st->ps_size) and each value is
 *              multiplied by st->frame_size squared before use.
 * @return 1, or the value returned by speex_compute_vad() when
 *         st->vad_enabled is set.
 */
int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
{
   int i;
   int is_speech=1;
   float mean_post=0;
   float mean_prior=0;
   int N = st->ps_size;
   /* N3: samples overlap-added with the previous frame; N4: the rest */
   int N3 = 2*N - st->frame_size;
   int N4 = st->frame_size - N3;
   /* Applied after the inverse transform near the end */
   float scale=.5f/N;
   float *ps=st->ps;
   float Zframe=0, Pframe;

   preprocess_analysis(st, x);

   update_noise_prob(st);

   st->nb_preprocess++;

   /* Noise estimation always updated for the 20 first times.
      NOTE(review): the test below uses nb_adapt<10, not 20 -- confirm which
      is intended. */
   if (st->nb_adapt<10)
   {
      update_noise(st, ps, echo);
   }

   /* Deal with residual echo if provided */
   if (echo)
      for (i=1;i<N;i++)
         st->echo_noise[i] = (.3f*st->echo_noise[i] + st->frame_size*st->frame_size*1.0*echo[i]);

   /* Compute a posteriori SNR (capped at 100, may be negative) */
   for (i=1;i<N;i++)
   {
      float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
      st->post[i] = ps[i]/tot_noise - 1.f;
      if (st->post[i]>100.f)
         st->post[i]=100.f;
      /*if (st->post[i]<0)
        st->post[i]=0;*/
      mean_post+=st->post[i];
   }
   /* The sum covers bins 1..N-1 but is divided by N */
   mean_post /= N;
   if (mean_post<0.f)
      mean_post=0.f;

   /* Special case for first frame */
   if (st->nb_adapt==1)
      for (i=1;i<N;i++)
         st->old_ps[i] = ps[i];

   /* Compute a priori SNR */
   {
      /* A priori update rate */
      for (i=1;i<N;i++)
      {
         /* Per-bin rate: .15 when the previous prior is 0, approaching 1 as
            the previous prior grows */
         float gamma = .15+.85*st->prior[i]*st->prior[i]/((1+st->prior[i])*(1+st->prior[i]));
         float tot_noise = 1.f+ NOISE_OVERCOMPENS*st->noise[i] + st->echo_noise[i] + st->reverb_estimate[i];
         /* A priori SNR update */
         st->prior[i] = gamma*max(0.0f,st->post[i]) +
               (1.f-gamma)* (.8*st->gain[i]*st->gain[i]*st->old_ps[i]/tot_noise + .2*st->prior[i]);

         if (st->prior[i]>100.f)
            st->prior[i]=100.f;

         mean_prior+=st->prior[i];
      }
   }
   mean_prior /= N;

#if 0
   for (i=0;i<N;i++)
   {
      fprintf (stderr, "%f ", st->prior[i]);
   }
   fprintf (stderr, "\n");
#endif
   /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/

   /* From the 20th call on, count consecutive frames that look like noise */
   if (st->nb_preprocess>=20)
   {
      int do_update = 0;
      float noise_ener=0, sig_ener=0;
      /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
      /*if (mean_prior<.23 && mean_post < .5)*/
      if (mean_prior<.23f && mean_post < .5f)
         do_update = 1;
      for (i=1;i<N;i++)
      {
         noise_ener += st->noise[i];
         sig_ener += ps[i];
      }
      /* Also treat the frame as noise when the noise estimate is far above
         the signal energy */
      if (noise_ener > 3.f*sig_ener)
         do_update = 1;
      /*do_update = 0;*/
      if (do_update)
      {
         st->consec_noise++;
      } else {
         st->consec_noise=0;
      }
   }

   if (st->vad_enabled)
      is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);

   if (st->consec_noise>=3)
   {
      /* Three or more noise frames in a row: full update from the previous
         frame's power spectrum */
      update_noise(st, st->old_ps, echo);
   } else {
      /* Otherwise update only bins 1..N-2 whose update_prob is below .5 */
      for (i=1;i<N-1;i++)
      {
         if (st->update_prob[i]<.5f/* || st->ps[i] < st->noise[i]*/)
         {
            if (echo)
               st->noise[i] = .95f*st->noise[i] + .05f*max(1.0f,st->ps[i]-st->frame_size*st->frame_size*1.0*echo[i]);
            else
               st->noise[i] = .95f*st->noise[i] + .05f*st->ps[i];
         }
      }
   }

   /* Recursive average of the a priori SNR */
   for (i=1;i<N;i++)
   {
      st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
   }

   /* Mean of zeta over the bins corresponding to 300..2000 (scaled by
      2*N/sampling_rate).
      NOTE(review): the divisor is zero if freq_end equals freq_start. */
   {
      int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
      int freq_end   = (int)(2000.0f*2.f*N/st->sampling_rate);
      for (i=freq_start;i<freq_end;i++)
      {
         Zframe += st->zeta[i];
      }
      Zframe /= (freq_end-freq_start);
   }
   st->Zlast = Zframe;

   Pframe = qcurve(Zframe);

   /*fprintf (stderr, "%f\n", Pframe);*/
   /* Compute gain according to the Ephraim-Malah algorithm */
   for (i=1;i<N;i++)
   {
      float MM;
      float theta;
      float prior_ratio;
      float p, q;
      float zeta1;
      float P1;

      prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
      theta = (1.f+st->post[i])*prior_ratio;

      /* Smooth zeta over neighbouring bins, except at the two edge bins */
      if (i==1 || i==N-1)
         zeta1 = st->zeta[i];
      else
         zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
      P1 = qcurve (zeta1);

      /* FIXME: add global prob (P2) */
      /* NOTE(review): the first assignment to q is overwritten immediately,
         so Pframe does not influence the gain */
      q = 1-Pframe*P1;
      q = 1-P1;
      if (q>.95f)
         q=.95f;
      p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
      /*p=1;*/

      /* Optimal estimator for loudness domain */
      MM = hypergeom_gain(theta);

      st->gain[i] = prior_ratio * MM;

      /*Put some (very arbitrary) limit on the gain*/
      if (st->gain[i]>2.f)
      {
         st->gain[i]=2.f;
      }

      /* Decaying reverberation estimate fed by the gained power spectrum */
      st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
      if (st->denoise_enabled)
      {
         /*st->gain2[i] = p*p*st->gain[i];*/
         /* Square of a p-weighted mix of sqrt(gain) and a floor of .2 */
         st->gain2[i]=(p*sqrt(st->gain[i])+.2*(1-p)) * (p*sqrt(st->gain[i])+.2*(1-p));
         /*st->gain2[i] = pow(st->gain[i], p) * pow(.1f,1.f-p);*/
      } else {
         st->gain2[i]=1.f;
      }
   }

   st->gain2[0]=st->gain[0]=0.f;
   st->gain2[N-1]=st->gain[N-1]=0.f;
   /*
   for (i=30;i<N-2;i++)
   {
      st->gain[i] = st->gain2[i]*st->gain2[i] + (1-st->gain2[i])*.333*(.6*st->gain2[i-1]+st->gain2[i]+.6*st->gain2[i+1]+.4*st->gain2[i-2]+.4*st->gain2[i+2]);
   }
   for (i=30;i<N-2;i++)
      st->gain2[i] = st->gain[i];
   */
   if (st->agc_enabled)
      speex_compute_agc(st, mean_prior);

#if 0
   if (!is_speech)
   {
      for (i=0;i<N;i++)
         st->gain2[i] = 0;
   }
#if 0
 else {
      for (i=0;i<N;i++)
         st->gain2[i] = 1;
   }
#endif
#endif

   /* Apply computed gain: bin i occupies frame[2*i-1] and frame[2*i] */
   for (i=1;i<N;i++)
   {
      st->frame[2*i-1] *= st->gain2[i];
      st->frame[2*i] *= st->gain2[i];
   }

   /* Get rid of the DC and very low frequencies */
   st->frame[0]=0;
   st->frame[1]=0;
   st->frame[2]=0;
   /* Nyquist frequency is mostly useless too */
   st->frame[2*N-1]=0;

   /* Inverse FFT with 1/N scaling */
   spx_drft_backward(st->fft_lookup, st->frame);

   for (i=0;i<2*N;i++)
      st->frame[i] *= scale;

   /* Scale the whole frame down if its peak exceeds 28000 */
   {
      float max_sample=0;
      for (i=0;i<2*N;i++)
         if (fabs(st->frame[i])>max_sample)
            max_sample = fabs(st->frame[i]);
      if (max_sample>28000.f)
      {
         float damp = 28000.f/max_sample;
         for (i=0;i<2*N;i++)
            st->frame[i] *= damp;
      }
   }

   /* Synthesis window */
   for (i=0;i<2*N;i++)
      st->frame[i] *= st->window[i];

   /* Perform overlap and add */
   for (i=0;i<N3;i++)
      x[i] = st->outbuf[i] + st->frame[i];
   for (i=0;i<N4;i++)
      x[N3+i] = st->frame[N3+i];

   /* Update outbuf */
   for (i=0;i<N3;i++)
      st->outbuf[i] = st->frame[st->frame_size+i];

   /* Save old power spectrum */
   for (i=1;i<N;i++)
      st->old_ps[i] = ps[i];

   return is_speech;
}