/**
 * Fill phi[][][] with lag-0, lag-1 and lag-2 correlation sums of the
 * 40-entry sequence x, where x[i][0] / x[i][1] are combined the way the
 * real / imaginary parts of a complex product are.
 *
 * Entries written:
 *   phi[2][1][0]     lag-0 sum over i = 0..37
 *   phi[1][0][0]     lag-0 sum over i = 1..38
 *   phi[1][1][0..1]  lag-1 sum over i = 0..37
 *   phi[0][0][0..1]  lag-1 sum over i = 1..38
 *   phi[0][1][0..1]  lag-2 sum over i = 0..37
 * All other entries of phi are left untouched.
 */
static void sbr_autocorrelate_c(const float x[40][2], float phi[3][2][2])
{
#if 0
    /* Reference form, kept for readers and for authors of arch-specific
     * DSP versions.  It is slower because it multiplies memory accesses. */
    autocorrelate(x, phi, 0);
    autocorrelate(x, phi, 1);
    autocorrelate(x, phi, 2);
#else
    /* The lag-2 accumulators are seeded with their i == 0 term; the other
     * accumulators get their edge terms (i == 0 or i == 38) added below. */
    float lag2_re = x[0][0] * x[2][0] + x[0][1] * x[2][1];
    float lag2_im = x[0][0] * x[2][1] - x[0][1] * x[2][0];
    float lag1_re = 0.0f;
    float lag1_im = 0.0f;
    float lag0    = 0.0f;
    int n = 1;

    /* Single pass over the shared range i = 1..37. */
    while (n < 38) {
        lag0    += x[n][0] * x[n    ][0] + x[n][1] * x[n    ][1];
        lag1_re += x[n][0] * x[n + 1][0] + x[n][1] * x[n + 1][1];
        lag1_im += x[n][0] * x[n + 1][1] - x[n][1] * x[n + 1][0];
        lag2_re += x[n][0] * x[n + 2][0] + x[n][1] * x[n + 2][1];
        lag2_im += x[n][0] * x[n + 2][1] - x[n][1] * x[n + 2][0];
        n++;
    }

    phi[0][1][0] = lag2_re;
    phi[0][1][1] = lag2_im;
    phi[2][1][0] = lag0    + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1];
    phi[1][0][0] = lag0    + x[38][0] * x[38][0] + x[38][1] * x[38][1];
    phi[1][1][0] = lag1_re + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1];
    phi[1][1][1] = lag1_im + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0];
    phi[0][0][0] = lag1_re + x[38][0] * x[39][0] + x[38][1] * x[39][1];
    phi[0][0][1] = lag1_im + x[38][0] * x[39][1] - x[38][1] * x[39][0];
#endif
}
/*
 * find_aks_for_lsp()
 *
 * Windows Nsam samples of Sn[], computes order+1 autocorrelation values,
 * adds LPC_FLOOR to R[0], applies lag_window[] to every R[i], and runs
 * levinson_durbin() to produce the LPCs a[0..order].  The residual energy
 * sum(a[i]*R[i]) is stored in *E; a negative result is replaced by 1E-12.
 *
 * Only order == 10 is supported.  That precondition is now asserted before
 * R[] and lag_window[] are indexed with order (it used to be checked only
 * after autocorrelate() had already written R[0..order]).
 *
 * NOTE(review): Wn[] holds N floats and R[] holds P+1; this presumably
 * requires Nsam <= N and P >= 10 -- confirm against the definitions of N
 * and P and against the callers.
 */
void find_aks_for_lsp(
  float Sn[],	/* Nsam samples with order sample memory */
  float a[],	/* order+1 LPCs with first coeff 1.0 */
  int Nsam,	/* number of input speech samples */
  int order,	/* order of the LPC analysis */
  float *E	/* residual energy */
)
{
  float Wn[N];	/* windowed frame of Nsam speech samples */
  float R[P+1];	/* order+1 autocorrelation values of Sn[] */
  int i;

  /* check the precondition before any buffer is indexed with order */
  assert(order == 10);	/* lag window only defined for order == 10 */

  hanning_window(Sn,Wn,Nsam);

  autocorrelate(Wn,R,Nsam,order);
  R[0] += LPC_FLOOR;
  for(i=0; i<=order; i++)
    R[i] *= lag_window[i];
  levinson_durbin(R,a,order);

  /* residual energy of the prediction filter */
  *E = 0.0;
  for(i=0; i<=order; i++)
    *E += a[i]*R[i];
  if (*E < 0.0)
    *E = 1E-12;	/* never report a negative energy */
}
/*
 * find_aks()
 *
 * Sn[]   Nsam samples with order sample memory
 * a[]    output: order+1 LPCs with first coeff 1.0
 * Nsam   number of input speech samples (asserted < LPC_MAX_N)
 * order  order of the LPC analysis (asserted < LPC_MAX)
 * E      output: residual energy sum(a[i]*R[i]), forced to 1E-12 when
 *        the sum comes out negative
 *
 * Pipeline: hanning_window() -> autocorrelate() -> levinson_durbin().
 */
void find_aks(float Sn[], float a[], int Nsam, int order, float *E)
{
  float windowed[LPC_MAX_N];	/* windowed frame of Nsam speech samples */
  float acorr[LPC_MAX+1];	/* order+1 autocorrelation values */
  int k;

  assert(order < LPC_MAX);
  assert(Nsam < LPC_MAX_N);

  hanning_window(Sn, windowed, Nsam);
  autocorrelate(windowed, acorr, Nsam, order);
  levinson_durbin(acorr, a, order);

  /* accumulate the residual energy directly in the caller's variable */
  *E = 0.0;
  k = 0;
  while (k <= order) {
    *E += a[k]*acorr[k];
    k++;
  }

  if (*E < 0.0) {
    *E = 1E-12;
  }
}
/*
 * find_aks() -- variant operating on the project's `scalar` type.
 *
 * Sn[]   Nsam samples with order sample memory
 * a[]    output: order+1 LPCs with first coeff 1.0
 * Nsam   number of input speech samples (asserted < LPC_MAX_N)
 * order  order of the LPC analysis
 * E      output: residual energy, built with s_add()/s_mul(); when it is
 *        below fl_to_numb(0.0) it is replaced by 1 (MATH_Q16_16 builds)
 *        or by powf(2, -16) (all other builds)
 */
void find_aks(scalar Sn[], scalar a[], int Nsam, int order, scalar *E)
{
  scalar windowed[LPC_MAX_N];	/* windowed frame of Nsam speech samples */
  scalar acorr[order+1];	/* order+1 autocorrelation values */
  int k;

  assert(Nsam < LPC_MAX_N);

  hanning_window(Sn, windowed, Nsam);
  autocorrelate(windowed, acorr, Nsam, order);
  levinson_durbin(acorr, a, order);

  /* residual energy, accumulated in the caller's variable */
  *E = fl_to_numb(0.0);
  k = 0;
  while (k <= order) {
    *E = s_add(*E , s_mul(a[k],acorr[k]));
    k++;
  }

  if (*E < fl_to_numb(0.0)) {
#ifdef MATH_Q16_16
    *E = 1;
#else
    *E = powf(2, -16);	/* was fl_to_numb(1E-12); changed for debugging purposes */
#endif
  }
}
float speech_to_uq_lsps(float lsp[], float ak[], float Sn[], float w[], int order ) { int i, roots; float Wn[M]; float R[LPC_MAX+1]; float e, E; e = 0.0; for(i=0; i<M; i++) { Wn[i] = Sn[i]*w[i]; e += Wn[i]*Wn[i]; } /* trap 0 energy case as LPC analysis will fail */ if (e == 0.0) { for(i=0; i<order; i++) lsp[i] = (PI/order)*(float)i; return 0.0; } autocorrelate(Wn, R, M, order); levinson_durbin(R, ak, order); E = 0.0; for(i=0; i<=order; i++) E += ak[i]*R[i]; /* 15 Hz BW expansion as I can't hear the difference and it may help help occasional fails in the LSP root finding. Important to do this after energy calculation to avoid -ve energy values. */ for(i=0; i<=order; i++) ak[i] *= powf(0.994,(float)i); roots = lpc_to_lsp(ak, order, lsp, 5, LSP_DELTA1); if (roots != order) { /* if root finding fails use some benign LSP values instead */ for(i=0; i<order; i++) lsp[i] = (PI/order)*(float)i; } return E; }
void Histogram::compute() { counts.resize( ceil( bin_size ) + 1 ); std::fill( counts.begin(), counts.end(), 0 ); Bins::Position low_border = Bins::Position::Constant( 0 * camera::pixel ), high_border = bins.sizes().array() - 1 * camera::pixel; for ( Bins::iterator i = bins.begin(); i != bins.end(); ++i ) { if ( (i.position() == low_border).any() || (i.position() == high_border).any() ) continue; autocorrelate( *i ); for (ForwardScan::iterator j = forward_scan.begin(); j != forward_scan.end(); ++j) { crosscorrelate( *i, bins( i.position() + *j ) ); } } if ( periodic_boundary ) counts.pop_back(); }
/*
 * speech_to_uq_lsps()
 *
 * Multiplies M samples of Sn[] by the window w[], derives ak[0..order] via
 * autocorrelate() and levinson_durbin(), and converts them to LSPs with
 * lpc_to_lsp().
 *
 * Returns the residual energy sum(ak[i]*R[i]).
 *
 * When lpc_to_lsp() does not return `order` roots, the root count and all
 * of ak[] are printed to stderr and lsp[i] is set to (PI/order)*i.
 */
float speech_to_uq_lsps(float lsp[], float ak[], float Sn[], float w[], int order)
{
    float windowed[M];
    float acorr[LPC_MAX+1];
    float residual;
    int   k, nroots;

    k = 0;
    while (k < M) {
        windowed[k] = Sn[k]*w[k];
        k++;
    }

    autocorrelate(windowed, acorr, M, order);
    levinson_durbin(acorr, ak, order);

    residual = 0.0;
    for(k=0; k<=order; k++)
        residual += ak[k]*acorr[k];

    nroots = lpc_to_lsp(ak, order, lsp, 5, LSP_DELTA1);
    if (nroots == order)
        return residual;

    /* The LSP roots could not be found; some alpha testers have reported
       this condition, so dump what we know to help track it down. */
    fprintf(stderr, "LSP roots not found!\nroots = %d\n", nroots);
    for(k=0; k<=order; k++)
        fprintf(stderr, "a[%d] = %f\n", k, ak[k]);

    /* fall back to some benign LSP values */
    for(k=0; k<order; k++)
        lsp[k] = (PI/order)*(float)k;

    return residual;
}
/**
 * Reference implementation: invoke autocorrelate() on (x, phi) once for
 * each of the lags 0, 1 and 2, in that order.
 */
static void sbr_autocorrelate_c(const float x[40][2], float phi[3][2][2])
{
    int lag;

    for (lag = 0; lag <= 2; lag++)
        autocorrelate(x, phi, lag);
}
int main(int argc, char *argv[]) { FILE *fout = NULL; /* output speech file */ FILE *fin; /* input speech file */ short buf[N]; /* input/output buffer */ float buf_float[N]; float Sn[M]; /* float input speech samples */ float Sn_pre[N]; /* pre-emphasised input speech samples */ COMP Sw[FFT_ENC]; /* DFT of Sn[] */ kiss_fft_cfg fft_fwd_cfg; kiss_fft_cfg fft_inv_cfg; float w[M]; /* time domain hamming window */ COMP W[FFT_ENC]; /* DFT of w[] */ MODEL model; float Pn[2*N]; /* trapezoidal synthesis window */ float Sn_[2*N]; /* synthesised speech */ int i,m; /* loop variable */ int frames; float prev_Wo, prev__Wo, prev_uq_Wo; float pitch; char out_file[MAX_STR]; char ampexp_arg[MAX_STR]; char phaseexp_arg[MAX_STR]; float snr; float sum_snr; int orderi; int lpc_model = 0, order = LPC_ORD; int lsp = 0, lspd = 0, lspvq = 0; int lspres = 0; int lspjvm = 0, lspjnd = 0, lspmel = 0; #ifdef __EXPERIMENTAL__ int lspanssi = 0, #endif int prede = 0; float pre_mem = 0.0, de_mem = 0.0; float ak[order]; COMP Sw_[FFT_ENC]; COMP Ew[FFT_ENC]; int phase0 = 0; float ex_phase[MAX_AMP+1]; int postfilt; int hand_voicing = 0, phaseexp = 0, ampexp = 0, hi = 0, simlpcpf = 0; int lpcpf = 0; FILE *fvoicing = 0; MODEL prev_model; int dec; int decimate = 1; float lsps[order]; float e, prev_e; int lsp_indexes[order]; float lsps_[order]; float Woe_[2]; float lsps_dec[4][LPC_ORD], e_dec[4], weight, weight_inc, ak_dec[4][LPC_ORD]; MODEL model_dec[4], prev_model_dec; float prev_lsps_dec[order], prev_e_dec; void *nlp_states; float hpf_states[2]; int scalar_quant_Wo_e = 0; int vector_quant_Wo_e = 0; int dump_pitch_e = 0; FILE *fjvm = NULL; #ifdef DUMP int dump; #endif struct PEXP *pexp = NULL; struct AEXP *aexp = NULL; float gain = 1.0; int bpf_en = 0; float bpf_buf[BPF_N+N]; char* opt_string = "ho:"; struct option long_options[] = { { "lpc", required_argument, &lpc_model, 1 }, { "lspjnd", no_argument, &lspjnd, 1 }, { "lspmel", no_argument, &lspmel, 1 }, { "lsp", no_argument, &lsp, 1 }, { "lspd", 
no_argument, &lspd, 1 }, { "lspvq", no_argument, &lspvq, 1 }, { "lspres", no_argument, &lspres, 1 }, { "lspjvm", no_argument, &lspjvm, 1 }, #ifdef __EXPERIMENTAL__ { "lspanssi", no_argument, &lspanssi, 1 }, #endif { "phase0", no_argument, &phase0, 1 }, { "phaseexp", required_argument, &phaseexp, 1 }, { "ampexp", required_argument, &exp, 1 }, { "postfilter", no_argument, &postfilt, 1 }, { "hand_voicing", required_argument, &hand_voicing, 1 }, { "dec", required_argument, &dec, 1 }, { "hi", no_argument, &hi, 1 }, { "simlpcpf", no_argument, &simlpcpf, 1 }, { "lpcpf", no_argument, &lpcpf, 1 }, { "prede", no_argument, &prede, 1 }, { "dump_pitch_e", required_argument, &dump_pitch_e, 1 }, { "sq_pitch_e", no_argument, &scalar_quant_Wo_e, 1 }, { "vq_pitch_e", no_argument, &vector_quant_Wo_e, 1 }, { "rate", required_argument, NULL, 0 }, { "gain", required_argument, NULL, 0 }, { "bpf", no_argument, &bpf_en, 1 }, #ifdef DUMP { "dump", required_argument, &dump, 1 }, #endif { "help", no_argument, NULL, 'h' }, { NULL, no_argument, NULL, 0 } }; int num_opts=sizeof(long_options)/sizeof(struct option); COMP Aw[FFT_ENC]; for(i=0; i<M; i++) { Sn[i] = 1.0; Sn_pre[i] = 1.0; } for(i=0; i<2*N; i++) Sn_[i] = 0; prev_uq_Wo = prev_Wo = prev__Wo = TWO_PI/P_MAX; prev_model.Wo = TWO_PI/P_MIN; prev_model.L = floor(PI/prev_model.Wo); for(i=1; i<=prev_model.L; i++) { prev_model.A[i] = 0.0; prev_model.phi[i] = 0.0; } for(i=1; i<=MAX_AMP; i++) { //ex_phase[i] = (PI/3)*(float)rand()/RAND_MAX; ex_phase[i] = 0.0; } e = prev_e = 1; hpf_states[0] = hpf_states[1] = 0.0; nlp_states = nlp_create(M); if (argc < 2) { print_help(long_options, num_opts, argv); } /*----------------------------------------------------------------*\ Interpret Command Line Arguments \*----------------------------------------------------------------*/ while(1) { int option_index = 0; int opt = getopt_long(argc, argv, opt_string, long_options, &option_index); if (opt == -1) break; switch (opt) { case 0: 
if(strcmp(long_options[option_index].name, "lpc") == 0) { orderi = atoi(optarg); if((orderi < 4) || (orderi > order)) { fprintf(stderr, "Error in LPC order (4 to %d): %s\n", order, optarg); exit(1); } order = orderi; #ifdef DUMP } else if(strcmp(long_options[option_index].name, "dump") == 0) { if (dump) dump_on(optarg); #endif } else if(strcmp(long_options[option_index].name, "lsp") == 0 || strcmp(long_options[option_index].name, "lspd") == 0 || strcmp(long_options[option_index].name, "lspvq") == 0) { assert(order == LPC_ORD); } else if(strcmp(long_options[option_index].name, "dec") == 0) { decimate = atoi(optarg); if ((decimate != 2) && (decimate != 4)) { fprintf(stderr, "Error in --dec, must be 2 or 4\n"); exit(1); } if (!phase0) { printf("needs --phase0 to resample phase when using --dec\n"); exit(1); } if (!lpc_model) { printf("needs --lpc [order] to resample amplitudes when using --dec\n"); exit(1); } } else if(strcmp(long_options[option_index].name, "hand_voicing") == 0) { if ((fvoicing = fopen(optarg,"rt")) == NULL) { fprintf(stderr, "Error opening voicing file: %s: %s.\n", optarg, strerror(errno)); exit(1); } } else if(strcmp(long_options[option_index].name, "dump_pitch_e") == 0) { if ((fjvm = fopen(optarg,"wt")) == NULL) { fprintf(stderr, "Error opening pitch & energy dump file: %s: %s.\n", optarg, strerror(errno)); exit(1); } } else if(strcmp(long_options[option_index].name, "phaseexp") == 0) { strcpy(phaseexp_arg, optarg); } else if(strcmp(long_options[option_index].name, "ampexp") == 0) { strcpy(ampexp_arg, optarg); } else if(strcmp(long_options[option_index].name, "gain") == 0) { gain = atof(optarg); } else if(strcmp(long_options[option_index].name, "rate") == 0) { if(strcmp(optarg,"3200") == 0) { lpc_model = 1; scalar_quant_Wo_e = 1; lspd = 1; phase0 = 1; postfilt = 1; decimate = 1; lpcpf = 1; } else if(strcmp(optarg,"2400") == 0) { lpc_model = 1; vector_quant_Wo_e = 1; lsp = 1; phase0 = 1; postfilt = 1; decimate = 2; lpcpf = 1; } else 
if(strcmp(optarg,"1400") == 0) { lpc_model = 1; vector_quant_Wo_e = 1; lsp = 1; phase0 = 1; postfilt = 1; decimate = 4; lpcpf = 1; } else if(strcmp(optarg,"1300") == 0) { lpc_model = 1; scalar_quant_Wo_e = 1; lsp = 1; phase0 = 1; postfilt = 1; decimate = 4; lpcpf = 1; } else if(strcmp(optarg,"1200") == 0) { lpc_model = 1; scalar_quant_Wo_e = 1; lspjvm = 1; phase0 = 1; postfilt = 1; decimate = 4; lpcpf = 1; } else { fprintf(stderr, "Error: invalid output rate (3200|2400|1400|1200) %s\n", optarg); exit(1); } } break; case 'h': print_help(long_options, num_opts, argv); break; case 'o': if (strcmp(optarg, "-") == 0) fout = stdout; else if ((fout = fopen(optarg,"wb")) == NULL) { fprintf(stderr, "Error opening output speech file: %s: %s.\n", optarg, strerror(errno)); exit(1); } strcpy(out_file,optarg); break; default: /* This will never be reached */ break; } } /* Input file */ if (strcmp(argv[optind], "-") == 0) fin = stdin; else if ((fin = fopen(argv[optind],"rb")) == NULL) { fprintf(stderr, "Error opening input speech file: %s: %s.\n", argv[optind], strerror(errno)); exit(1); } ex_phase[0] = 0; Woe_[0] = Woe_[1] = 1.0; /* printf("lspd: %d lspdt: %d lspdt_mode: %d phase0: %d postfilt: %d " "decimate: %d dt: %d\n",lspd,lspdt,lspdt_mode,phase0,postfilt, decimate,dt); */ /* Initialise ------------------------------------------------------------*/ fft_fwd_cfg = kiss_fft_alloc(FFT_ENC, 0, NULL, NULL); /* fwd FFT,used in several places */ fft_inv_cfg = kiss_fft_alloc(FFT_DEC, 1, NULL, NULL); /* inverse FFT, used just for synth */ make_analysis_window(fft_fwd_cfg, w, W); make_synthesis_window(Pn); quantise_init(); if (phaseexp) pexp = phase_experiment_create(); if (ampexp) aexp = amp_experiment_create(); if (bpf_en) { for(i=0; i<BPF_N; i++) bpf_buf[i] = 0.0; } for(i=0; i<LPC_ORD; i++) { prev_lsps_dec[i] = i*PI/(LPC_ORD+1); } prev_e_dec = 1; for(m=1; m<=MAX_AMP; m++) prev_model_dec.A[m] = 0.0; prev_model_dec.Wo = TWO_PI/P_MAX; prev_model_dec.L = PI/prev_model_dec.Wo; 
prev_model_dec.voiced = 0; /*----------------------------------------------------------------* \ Main Loop \*----------------------------------------------------------------*/ frames = 0; sum_snr = 0; while(fread(buf,sizeof(short),N,fin)) { frames++; for(i=0; i<N; i++) buf_float[i] = buf[i]; /* optionally filter input speech */ if (prede) { pre_emp(Sn_pre, buf_float, &pre_mem, N); for(i=0; i<N; i++) buf_float[i] = Sn_pre[i]; } if (bpf_en) { for(i=0; i<BPF_N; i++) bpf_buf[i] = bpf_buf[N+i]; for(i=0; i<N; i++) bpf_buf[BPF_N+i] = buf_float[i]; inverse_filter(&bpf_buf[BPF_N], bpf, N, buf_float, BPF_N); } /* shift buffer of input samples, and insert new samples */ for(i=0; i<M-N; i++) { Sn[i] = Sn[i+N]; } for(i=0; i<N; i++) Sn[i+M-N] = buf_float[i]; /*------------------------------------------------------------*\ Estimate Sinusoidal Model Parameters \*------------------------------------------------------------*/ nlp(nlp_states,Sn,N,P_MIN,P_MAX,&pitch,Sw,W,&prev_uq_Wo); model.Wo = TWO_PI/pitch; dft_speech(fft_fwd_cfg, Sw, Sn, w); two_stage_pitch_refinement(&model, Sw); estimate_amplitudes(&model, Sw, W, 1); #ifdef DUMP dump_Sn(Sn); dump_Sw(Sw); dump_model(&model); #endif if (ampexp) amp_experiment(aexp, &model, ampexp_arg); if (phaseexp) { #ifdef DUMP dump_phase(&model.phi[0], model.L); #endif phase_experiment(pexp, &model, phaseexp_arg); #ifdef DUMP dump_phase_(&model.phi[0], model.L); #endif } if (hi) { int m; for(m=1; m<model.L/2; m++) model.A[m] = 0.0; for(m=3*model.L/4; m<=model.L; m++) model.A[m] = 0.0; } /*------------------------------------------------------------*\ Zero-phase modelling \*------------------------------------------------------------*/ if (phase0) { float Wn[M]; /* windowed speech samples */ float Rk[order+1]; /* autocorrelation coeffs */ COMP a[FFT_ENC]; #ifdef DUMP dump_phase(&model.phi[0], model.L); #endif /* find aks here, these are overwritten if LPC modelling is enabled */ for(i=0; i<M; i++) Wn[i] = Sn[i]*w[i]; autocorrelate(Wn,Rk,M,order); 
levinson_durbin(Rk,ak,order); /* determine voicing */ snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew); if (dump_pitch_e) fprintf(fjvm, "%f %f %d ", model.Wo, snr, model.voiced); //printf("snr %3.2f v: %d Wo: %f prev_Wo: %f\n", snr, model.voiced, // model.Wo, prev_uq_Wo); #ifdef DUMP dump_Sw_(Sw_); dump_Ew(Ew); dump_snr(snr); #endif /* just to make sure we are not cheating - kill all phases */ for(i=0; i<=MAX_AMP; i++) model.phi[i] = 0; /* Determine DFT of A(exp(jw)), which is needed for phase0 model when LPC is not used, e.g. indecimate=1 (10ms) frames with no LPC */ for(i=0; i<FFT_ENC; i++) { a[i].real = 0.0; a[i].imag = 0.0; } for(i=0; i<=order; i++) a[i].real = ak[i]; kiss_fft(fft_fwd_cfg, (kiss_fft_cpx *)a, (kiss_fft_cpx *)Aw); if (hand_voicing) { fscanf(fvoicing,"%d\n",&model.voiced); } } /*------------------------------------------------------------*\ LPC model amplitudes and LSP quantisation \*------------------------------------------------------------*/ if (lpc_model) { e = speech_to_uq_lsps(lsps, ak, Sn, w, order); for(i=0; i<LPC_ORD; i++) lsps_[i] = lsps[i]; #ifdef DUMP dump_ak(ak, order); dump_E(e); #endif /* tracking down -ve energy values with BW expansion */ /* if (e < 0.0) { int i; FILE*f=fopen("x.txt","wt"); for(i=0; i<M; i++) fprintf(f,"%f\n", Sn[i]); fclose(f); printf("e = %f frames = %d\n", e, frames); for(i=0; i<order; i++) printf("%f ", ak[i]); exit(0); } */ if (dump_pitch_e) fprintf(fjvm, "%f\n", e); #ifdef DUMP dump_lsp(lsps); #endif /* various LSP quantisation schemes */ if (lsp) { encode_lsps_scalar(lsp_indexes, lsps, LPC_ORD); decode_lsps_scalar(lsps_, lsp_indexes, LPC_ORD); bw_expand_lsps(lsps_, LPC_ORD, 50.0, 100.0); lsp_to_lpc(lsps_, ak, LPC_ORD); } if (lspd) { encode_lspds_scalar(lsp_indexes, lsps, LPC_ORD); decode_lspds_scalar(lsps_, lsp_indexes, LPC_ORD); lsp_to_lpc(lsps_, ak, LPC_ORD); } #ifdef __EXPERIMENTAL__ if (lspvq) { lspvq_quantise(lsps, lsps_, LPC_ORD); bw_expand_lsps(lsps_, LPC_ORD, 50.0, 100.0); lsp_to_lpc(lsps_, ak, 
LPC_ORD); } #endif if (lspjvm) { /* Jean-Marc's multi-stage, split VQ */ lspjvm_quantise(lsps, lsps_, LPC_ORD); { float lsps_bw[LPC_ORD]; memcpy(lsps_bw, lsps_, sizeof(float)*LPC_ORD); bw_expand_lsps(lsps_bw, LPC_ORD, 50.0, 100.0); lsp_to_lpc(lsps_bw, ak, LPC_ORD); } } #ifdef __EXPERIMENTAL__ if (lspanssi) { /* multi-stage VQ from Anssi Ramo OH3GDD */ lspanssi_quantise(lsps, lsps_, LPC_ORD, 5); bw_expand_lsps(lsps_, LPC_ORD, 50.0, 100.0); lsp_to_lpc(lsps_, ak, LPC_ORD); } #endif /* experimenting with non-linear LSP spacing to see if it's just noticable */ if (lspjnd) { for(i=0; i<LPC_ORD; i++) lsps_[i] = lsps[i]; locate_lsps_jnd_steps(lsps_, LPC_ORD); lsp_to_lpc(lsps_, ak, LPC_ORD); } /* Another experiment with non-linear LSP spacing, this time using a scaled version of mel frequency axis warping. The scaling is such that the integer output can be directly sent over the channel. */ if (lspmel) { float f, f_; float mel[LPC_ORD]; int mel_indexes[LPC_ORD]; for(i=0; i<order; i++) { f = (4000.0/PI)*lsps[i]; mel[i] = floor(2595.0*log10(1.0 + f/700.0) + 0.5); } for(i=1; i<order; i++) { if (mel[i] == mel[i-1]) mel[i]++; } encode_mels_scalar(mel_indexes, mel, 6); decode_mels_scalar(mel, mel_indexes, 6); #ifdef DUMP dump_mel(mel, order); #endif for(i=0; i<LPC_ORD; i++) { f_ = 700.0*( pow(10.0, (float)mel[i]/2595.0) - 1.0); lsps_[i] = f_*(PI/4000.0); } lsp_to_lpc(lsps_, ak, order); } if (scalar_quant_Wo_e) { e = decode_energy(encode_energy(e, E_BITS), E_BITS); model.Wo = decode_Wo(encode_Wo(model.Wo, WO_BITS), WO_BITS); model.L = PI/model.Wo; /* if we quantise Wo re-compute L */ } if (vector_quant_Wo_e) { /* JVM's experimental joint Wo & LPC energy quantiser */ quantise_WoE(&model, &e, Woe_); } } /*------------------------------------------------------------*\ Synthesise and optional decimation to 20 or 40ms frame rate \*------------------------------------------------------------*/ /* if decimate == 2, we interpolate frame n from frame n-1 and n+1 if decimate == 4, we 
interpolate frames n, n+1, n+2, from frames n-1 and n+3 This is meant to give identical results to the implementations of various modes in codec2.c */ /* delay line to keep frame by frame voicing decisions */ for(i=0; i<decimate-1; i++) model_dec[i] = model_dec[i+1]; model_dec[decimate-1] = model; if ((frames % decimate) == 0) { for(i=0; i<order; i++) lsps_dec[decimate-1][i] = lsps_[i]; e_dec[decimate-1] = e; model_dec[decimate-1] = model; /* interpolate the model parameters */ weight_inc = 1.0/decimate; for(i=0, weight=weight_inc; i<decimate-1; i++, weight += weight_inc) { //model_dec[i].voiced = model_dec[decimate-1].voiced; interpolate_lsp_ver2(&lsps_dec[i][0], prev_lsps_dec, &lsps_dec[decimate-1][0], weight, order); interp_Wo2(&model_dec[i], &prev_model_dec, &model_dec[decimate-1], weight); e_dec[i] = interp_energy2(prev_e_dec, e_dec[decimate-1],weight); } /* then recover spectral amplitudes and synthesise */ for(i=0; i<decimate; i++) { if (lpc_model) { lsp_to_lpc(&lsps_dec[i][0], &ak_dec[i][0], order); aks_to_M2(fft_fwd_cfg, &ak_dec[i][0], order, &model_dec[i], e_dec[i], &snr, 0, simlpcpf, lpcpf, 1, LPCPF_BETA, LPCPF_GAMMA, Aw); apply_lpc_correction(&model_dec[i]); #ifdef DUMP dump_lsp_(&lsps_dec[i][0]); dump_ak_(&ak_dec[i][0], order); sum_snr += snr; dump_quantised_model(&model_dec[i]); #endif } if (phase0) phase_synth_zero_order(fft_fwd_cfg, &model_dec[i], ex_phase, Aw); synth_one_frame(fft_inv_cfg, buf, &model_dec[i], Sn_, Pn, prede, &de_mem, gain); if (fout != NULL) fwrite(buf,sizeof(short),N,fout); } /* for(i=0; i<decimate; i++) { printf("%d Wo: %f L: %d v: %d\n", frames, model_dec[i].Wo, model_dec[i].L, model_dec[i].voiced); } if (frames == 4*50) exit(0); */ /* update memories for next frame ----------------------------*/ prev_model_dec = model_dec[decimate-1]; prev_e_dec = e_dec[decimate-1]; for(i=0; i<LPC_ORD; i++) prev_lsps_dec[i] = lsps_dec[decimate-1][i]; } } /*----------------------------------------------------------------*\ End Main Loop 
\*----------------------------------------------------------------*/ fclose(fin); if (fout != NULL) fclose(fout); if (lpc_model) printf("SNR av = %5.2f dB\n", sum_snr/frames); if (phaseexp) phase_experiment_destroy(pexp); if (ampexp) amp_experiment_destroy(aexp); #ifdef DUMP if (dump) dump_off(); #endif if (hand_voicing) fclose(fvoicing); nlp_destroy(nlp_states); return 0; }
/*
 * lpc_model_amplitudes()
 *
 * Derives LPCs from the windowed speech frame (Sn[i]*w[i], M samples) with
 * autocorrelate() and levinson_durbin(), optionally quantises them in the
 * LSP domain, and converts the result to model amplitudes with aks_to_M2().
 *
 * Sn[]       input frame of speech samples
 * w[]        analysis window, multiplied sample by sample with Sn[]
 * model      sinusoidal model parameters, handed to aks_to_M2()
 * order      LPC model order
 * lsp_quant  optional LSP quantisation if non-zero
 * ak[]       output aks
 *
 * Returns the snr value filled in by aks_to_M2().
 *
 * The quantiser loop and the bandwidth expansion loops used to run over
 * fixed index ranges (0..9, 1..4, 5..7) regardless of `order`, so they
 * touched lsp[]/lsp_hz[] entries that were never initialised when `order`
 * was smaller.  They are now bounded by `order`; behaviour is unchanged
 * for order >= 10.
 */
float lpc_model_amplitudes(
  float  Sn[],			/* Input frame of speech samples */
  float  w[],
  MODEL *model,			/* sinusoidal model parameters */
  int    order,                 /* LPC model order */
  int    lsp_quant,             /* optional LSP quantisation if non-zero */
  float  ak[]                   /* output aks */
)
{
  float Wn[M];
  float R[LPC_MAX+1];
  float E;
  int   i,j;
  float snr;
  float lsp[LPC_MAX];
  float lsp_hz[LPC_MAX];
  float lsp_[LPC_MAX];
  int   roots;                  /* number of LSP roots found */
  int   index;
  float se;
  int   k,m;
  const float * cb;
  float wt[LPC_MAX];
  int   nquant;                 /* number of LSPs that have a quantiser */

  for(i=0; i<M; i++)
    Wn[i] = Sn[i]*w[i];
  autocorrelate(Wn,R,M,order);
  levinson_durbin(R,ak,order);

  /* residual energy of the prediction filter */
  E = 0.0;
  for(i=0; i<=order; i++)
    E += ak[i]*R[i];

  /* uniform weights for the quantiser search */
  for(i=0; i<order; i++)
    wt[i] = 1.0;

  if (lsp_quant) {
    roots = lpc_to_lsp(ak, order, lsp, 5, LSP_DELTA1);
    if (roots != order)
      printf("LSP roots not found\n");

    /* convert from radians to Hz to make quantisers more
       human readable */

    for(i=0; i<order; i++)
      lsp_hz[i] = (4000.0/PI)*lsp[i];

    /* simple uniform scalar quantisers; ten of them are used, and only
       the first `order` LSPs hold valid data */

    nquant = (order < 10) ? order : 10;
    for(i=0; i<nquant; i++) {
      k = lsp_cb[i].k;
      m = lsp_cb[i].m;
      cb = lsp_cb[i].cb;
      index = quantise(cb, &lsp_hz[i], wt, k, m, &se);
      lsp_hz[i] = cb[index*k];
    }

    /* experiment: simulating uniform quantisation error
    for(i=0; i<order; i++)
      lsp[i] += PI*(12.5/4000.0)*(1.0 - 2.0*(float)rand()/RAND_MAX);
    */

    for(i=0; i<order; i++)
      lsp[i] = (PI/4000.0)*lsp_hz[i];

    /* Bandwidth Expansion (BW).  Prevents any two LSPs getting too
       close together after quantisation.  We know from experiment
       that LSP quantisation errors < 12.5Hz (25Hz step size) are
       inaudible so we use that as the minimum LSP separation.
    */

    for(i=1; i<5 && i<order; i++) {
      if (lsp[i] - lsp[i-1] < PI*(12.5/4000.0))
        lsp[i] = lsp[i-1] + PI*(12.5/4000.0);
    }

    /* as quantiser gaps increased, larger BW expansion was required
       to prevent twinkly noises */

    for(i=5; i<8 && i<order; i++) {
      if (lsp[i] - lsp[i-1] < PI*(25.0/4000.0))
        lsp[i] = lsp[i-1] + PI*(25.0/4000.0);
    }
    for(i=8; i<order; i++) {
      if (lsp[i] - lsp[i-1] < PI*(75.0/4000.0))
        lsp[i] = lsp[i-1] + PI*(75.0/4000.0);
    }

    for(j=0; j<order; j++)
      lsp_[j] = lsp[j];

    lsp_to_lpc(lsp_, ak, order);
#ifdef DUMP
    dump_lsp(lsp);
#endif
  }

#ifdef DUMP
  dump_E(E);
#endif
#ifdef SIM_QUANT
  /* simulated LPC energy quantisation */
  {
    float e = 10.0*log10(E);
    e += 2.0*(1.0 - 2.0*(float)rand()/RAND_MAX);
    E = pow(10.0,e/10.0);
  }
#endif

  aks_to_M2(ak,order,model,E,&snr, 1);   /* {ak} -> {Am} LPC decode */

  return snr;
}
double PitchDetector::detectAcfPitchForBlock (float* samples, int numSamples) { const int minSample = int (sampleRate / maxFrequency); const int maxSample = int (sampleRate / minFrequency); lowFilter.reset(); highFilter.reset(); lowFilter.processSamples (samples, numSamples); highFilter.processSamples (samples, numSamples); autocorrelate (samples, numSamples, buffer1.getData()); normalise (buffer1.getData(), buffer1.getSize()); // float max = 0.0f; // int sampleIndex = 0; // for (int i = minSample; i < maxSample; ++i) // { // const float sample = buffer1.getData()[i]; // if (sample > max) // { // max = sample; // sampleIndex = i; // } // } float* bufferData = buffer1.getData(); // const int bufferSize = buffer1.getSize(); int firstNegativeZero = 0; // first peak method for (int i = 0; i < numSamples - 1; ++i) { if (bufferData[i] >= 0.0f && bufferData[i + 1] < 0.0f) { firstNegativeZero = i; break; } } // apply gain ramp // float rampDelta = 1.0f / numSamples; // float rampLevel = 1.0f; // for (int i = 0; i < numSamples - 1; ++i) // { // bufferData[i] *= cubeNumber (rampLevel); // rampLevel -= rampDelta; // } float max = -1.0f; int sampleIndex = 0; for (int i = jmax (firstNegativeZero, minSample); i < maxSample; ++i) { if (bufferData[i] > max) { max = bufferData[i]; sampleIndex = i; } } // buffer2.setSizeQuick (numSamples); /* autocorrelate (buffer1.getData(), buffer1.getSize(), buffer2.getData()); normalise (buffer2.getData(), buffer2.getSize());*/ //buffer2.quickCopy (buffer1.getData(), buffer1.getSize()); // differentiate (buffer1.getData(), buffer1.getSize(), buffer2.getData()); // normalise (buffer2.getData()+2, buffer2.getSize()-2); // differentiate (buffer2.getData(), buffer2.getSize(), buffer2.getData()); /* for (int i = minSample + 1; i < maxSample - 1; ++i) { const float previousSample = buffer2.getData()[i - 1]; const float sample = buffer2.getData()[i]; const float nextSample = buffer2.getData()[i + 1]; if (sample > previousSample && sample > nextSample 
&& sample > 0.5f) sampleIndex = i; }*/ //differentiate (buffer2.getData(), buffer2.getSize(), buffer2.getData()); //normalise (buffer2.getData() + minSample, buffer2.getSize() - minSample); // float min = 0.0f; // int sampleIndex = 0; // for (int i = minSample; i < maxSample; ++i) // { // const float sample = buffer2.getData()[i]; // if (sample < min) // { // min = sample; // sampleIndex = i; // } // } if (sampleIndex > 0) return sampleRate / sampleIndex; else return 0.0; }
main(int argc, char *argv[]) { FILE *fp; int fd,arg; snd_pcm_t *handle; snd_pcm_hw_params_t *hw_params; int rate=8000; float f[WINDOW],hann[WINDOW],w[WINDOW],w2[WINDOW],s0,s1=0,tot; float ac[ORDER+1],lcp[ORDER+1],lsp[ORDER],l[ORDER],weight[ORDER],delta,d; short sample,s[160],buf[2000]; int i,j,n,b,toggle=1; float e,laste=0; int ebit=ETOPBIT, ebuff=0; int sound=0; // boolean start/stop float f2[FFT],min; float real[FFT],imag[FFT]; int dummy[100000]; float amp[WINDOW],pha[WINDOW]; int frame=0; SpeexPreprocessState *st; for (i=0; i<8; i++) report[i]=0; st = speex_preprocess_state_init(160, 8000); i=1; speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DENOISE, &i); // i=1; // speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB, &i); // e=.0; // speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB_DECAY, &e); // e=.0; // speex_preprocess_ctl(st, SPEEX_PREPROCESS_SET_DEREVERB_LEVEL, &e); setup_twiddles(realtwiddle,imtwiddle,FFT); for (i=0; i<WINDOW; i++) f[i]=0; for (i=0; i<ORDER; i++) {last[i]=0; last2[i]=0;} for(i=0; i<WINDOW; i++) hann[i]=0.5-0.5*cos(2.0*M_PI*i/(WINDOW-1)); if (argc==2) training=atoi(argv[1]); fprintf(stderr,"training=%i\n",training); // exit(0); cbsize=0; start[0]=0; size[0]=0; if (training==0) if (fp=fopen("cb.txt","r")) { while(!feof(fp)) { fscanf(fp,"%i\n",&size[cbsize]); for (i=start[cbsize]; i<start[cbsize]+size[cbsize]; i++) { for (n=1; n<FF2-1; n++) fscanf(fp,"%f,",&cb[i][n]); fscanf(fp,"\n"); } start[cbsize+1]=start[cbsize]+size[cbsize]; cbsize++; } fclose(fp); } //for (i=0; i<cbsize; i++) printf("%i,\n",size[i]); exit(0); //--------------------------------- fp=fopen("/tmp/b.raw","w"); snd_pcm_open(&handle, "default", SND_PCM_STREAM_CAPTURE, 0); snd_pcm_hw_params_malloc(&hw_params); snd_pcm_hw_params_any(handle, hw_params); snd_pcm_hw_params_set_access(handle, hw_params, SND_PCM_ACCESS_RW_INTERLEAVED); snd_pcm_hw_params_set_format(handle, hw_params, SND_PCM_FORMAT_S16_LE); snd_pcm_hw_params_set_rate_near(handle, hw_params, &rate, 0); 
snd_pcm_hw_params_set_channels(handle, hw_params, 2); snd_pcm_hw_params(handle, hw_params); snd_pcm_hw_params_free(hw_params); snd_pcm_prepare(handle); //printf("sleep 1...\n"); sleep(1); printf("OK, go....\n"); while(1) { for (i=0; i<WINDOW-STEP; i++) f[i]=f[i+STEP]; // shift samples down if (toggle) { //read(fd,s,160*2); snd_pcm_readi(handle, buf, 160); for (i=0; i<160; i++) s[i]=buf[i*2]; speex_preprocess_run(st, s); } else bcopy(&s[80],s,80*2); toggle=!toggle; for (i=WINDOW-STEP,j=0; i<WINDOW; i++,j++) { sample=s[j]; s0=(float)sample; f[i]=s0-s1*EMPH; s1=s0; // 1.0 pre-emphasis fwrite(&sample,2,1,fp); } for (i=0; i<WINDOW; i++) w[i]=f[i]; // remove any DC level.... tot=0; for (i=0; i<WINDOW; i++) tot+=w[i]; tot/=WINDOW; for (i=0; i<WINDOW; i++) w[i]-=tot; for (i=0; i<WINDOW; i++) w[i]*=hann[i]; // window data autocorrelate(w,ac,WINDOW,ORDER); wld(&lpc[1],ac,ORDER); lpc[0]=1.0; // e=ac[0]; e=0;for(i=0; i<=ORDER; i++) e+=ac[i]*lpc[i]; if (e<0) e=0; if (e>TOL_OFF) ebuff|=ebit; else ebuff&=~ebit; // update energy bit-buffer ebit>>=1; if (ebit==0) ebit=ETOPBIT; // circular shift for (i=0; i<FFT; i++) {real[i]=0; imag[i]=0;} for (i=0; i<=ORDER; i++) real[i]=lpc[i]; simple_fft(real,imag,realtwiddle,imtwiddle,FFT); for (i=0; i<FF2; i++) { b=bin[i]; f2[i]=powf(real[b]*real[b]+imag[b]*imag[b],-0.5); //f2[i]=powf(f2[i],0.333333); //f2[i]=powf(f2[i],1.2); f2[i]=logf(f2[i]); } // spectral tilt compensation... for (i=1; i<FF2; i++) f2[i]=f2[i]*(float)(i+TILT)/TILT; // fold down to 9 bins... 
/* if (f2[FF2-2]>f2[FF2-3]) f2[FF2-3]=f2[FF2-2]; f2[FF2-2]=0; if (f2[FF2-4]>f2[FF2-5]) f2[FF2-5]=f2[FF2-4]; f2[FF2-4]=0; if (f2[FF2-9]>f2[FF2-10]) f2[FF2-10]=f2[FF2-9]; f2[FF2-9]=0; if (f2[FF2-11]>f2[FF2-12]) f2[FF2-12]=f2[FF2-11]; f2[FF2-11]=0; if (f2[FF2-13]>f2[FF2-14]) f2[FF2-14]=f2[FF2-13]; f2[FF2-13]=0; if (f2[FF2-15]>f2[FF2-16]) f2[FF2-16]=f2[FF2-15]; f2[FF2-15]=0; */ for (i=0; i<FF2; i++) { if (f2[i]>6.0) f2[i]=6.0; f2[i]*=100.0; } if (TRACE) { fprintf(stderr,"%.f,",e); for (i=1; i<FF2-1; i++) fprintf(stderr,"%.f,",f2[i]); fprintf(stderr,"\n");} // calculate frame delta.... delta=0; for (i=1; i<FF2-1; i++) {d=f2[i]-last[i]; delta+=d*d;} //printf("delta=%f\n",delta); if (sound==0 && e>TOL_ON && frame>200) { // start recording... bcopy(last2,&word[wsize],FF2*4); wsize++; bcopy(last,&word[wsize],FF2*4); wsize++; sound=1; wsize=0; bcopy(f2,&word[wsize],FF2*4); wsize++; bcopy(last,last2,FF2*4); bcopy(f2,last,FF2*4); } else if (sound==1 && e>TOL_OFF) { // continue reading word... bcopy(f2,&word[wsize],FF2*4); wsize++; if (wsize>200) wsize=200; bcopy(last,last2,FF2*4); bcopy(f2,last,FF2*4); } else if (sound==1 && ebuff==0) { // finised reading word // wsize-=8; // remove training silence (2 frame buffer) if (wsize>4 && wsize<50) { if (training>0) train(); else closest(); } sound=0; wsize=0; bcopy(last,last2,FF2*4); bcopy(f2,last,FF2*4); } //for (i=1; i<FF2-1; i++) printf("%.0f,",f2[i]); printf(" e=%f\n",e); laste=e; frame++; if (frame==37800) exit(0); } }