/**
 * Mark the beginning of a new stream on an acmod object.
 *
 * Calls fe_start_stream() on the front end attached to the object, then
 * sets the object's utt_start_frame field back to 0.
 *
 * @param acmod  Object to update.  It is dereferenced without a NULL
 *               check, so the caller must pass a valid pointer.
 */
void
acmod_start_stream(acmod_t *acmod)
{
    fe_start_stream(acmod->fe);
    acmod->utt_start_frame = 0;
}
void segment_audio() { FILE *file; int16 pcm_buf[BLOCKSIZE]; mfcc_t **cep_buf; int16 voiced_buf = NULL; int32 voiced_nsamps, out_frameidx, uttstart = 0; char file_name[1024]; uint8 cur_vad_state, vad_state, writing; int uttno, uttlen, sample_rate; int32 nframes, nframes_tmp; int16 frame_size, frame_shift, frame_rate; size_t k; sample_rate = (int) cmd_ln_float32_r(config, "-samprate"); frame_rate = cmd_ln_int32_r(config, "-frate"); frame_size = (int32) (cmd_ln_float32_r(config, "-wlen") * sample_rate + 0.5); frame_shift = (int32) (sample_rate / cmd_ln_int32_r(config, "-frate") + 0.5); nframes = (BLOCKSIZE - frame_size) / frame_shift; cep_buf = (mfcc_t **) ckd_calloc_2d(nframes, fe_get_output_size(fe), sizeof(mfcc_t)); uttno = 0; uttlen = 0; cur_vad_state = 0; voiced_nsamps = 0; writing = 0; file = NULL; fe_start_stream(fe); fe_start_utt(fe); while ((k = read_audio(pcm_buf, BLOCKSIZE)) > 0) { int16 const *pcm_buf_tmp; pcm_buf_tmp = &pcm_buf[0]; while (k) { nframes_tmp = nframes; fe_process_frames_ext(fe, &pcm_buf_tmp, &k, cep_buf, &nframes_tmp, voiced_buf, &voiced_nsamps, &out_frameidx); if (out_frameidx > 0) { uttstart = out_frameidx; } vad_state = fe_get_vad_state(fe); if (!cur_vad_state && vad_state) { /* silence->speech transition, time to start new file */ uttno++; if (!singlefile) { sprintf(file_name, "%s%04d.raw", infile_path, uttno); if ((file = fopen(file_name, "wb")) == NULL) E_FATAL_SYSTEM("Failed to open '%s' for writing", file_name); } else { sprintf(file_name, "%s.raw", infile_path); if ((file = fopen(file_name, "ab")) == NULL) E_FATAL_SYSTEM("Failed to open '%s' for writing", file_name); } writing = 1; } if (writing && file && voiced_nsamps > 0) { fwrite(voiced_buf, sizeof(int16), voiced_nsamps, file); uttlen += voiced_nsamps; } if (cur_vad_state && !vad_state) { /* speech -> silence transition, time to finish file */ fclose(file); printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n", uttno, file_name, ((double) uttstart) / 
frame_rate, uttlen, ((double) uttlen) / sample_rate); fflush(stdout); fe_end_utt(fe, cep_buf[0], &nframes_tmp); writing = 0; uttlen = 0; voiced_nsamps = 0; fe_start_utt(fe); } cur_vad_state = vad_state; } } if (writing) { fclose(file); printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n", uttno, file_name, ((double) uttstart) / frame_rate, uttlen, ((double) uttlen) / sample_rate); fflush(stdout); } fe_end_utt(fe, cep_buf[0], &nframes); ckd_free_2d(cep_buf); }
/**
 * Allocate a front end object and configure it from a parsed command line.
 *
 * The object starts with a reference count of 1.  `config` is passed
 * through cmd_ln_retain() before being handed to
 * fe_parse_general_params().
 *
 * Initialization is abandoned, the partially built object is released
 * with fe_free(), and NULL is returned when:
 *  - fe_parse_general_params() reports a negative result,
 *  - the frame size is smaller than the frame shift,
 *  - the frame size is larger than the FFT size, or
 *  - the mel filterbank's upper frequency exceeds sampling_rate / 2 + 1.0.
 *
 * @param config  Parsed command-line configuration.
 * @return The initialized front end, or NULL on any of the errors above.
 */
fe_t *
fe_init_auto_r(cmd_ln_t *config)
{
    fe_t *fe;
    int prespch_width;

    fe = (fe_t *) ckd_calloc(1, sizeof(*fe));
    fe->refcount = 1;

    /* Copy the general parameters into the front end. */
    if (fe_parse_general_params(cmd_ln_retain(config), fe) < 0)
        goto fail;

    /*
     * Derive the remaining parameters.  0.5 is added before the cast so
     * that the float is rounded to the closest integer instead of being
     * truncated: e.g. 2.3 becomes 2, whereas 3.7 becomes 4.
     */
    fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
    fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
    fe->prior = 0;

    fe_start_stream(fe);

    assert(fe->frame_shift > 1);

    if (fe->frame_size < fe->frame_shift) {
        E_ERROR
            ("Frame size %d (-wlen) must be greater than frame shift %d (-frate)\n",
             fe->frame_size, fe->frame_shift);
        goto fail;
    }

    if (fe->frame_size > (fe->fft_size)) {
        E_ERROR
            ("Number of FFT points has to be a power of 2 higher than %d, it is %d\n",
             fe->frame_size, fe->fft_size);
        goto fail;
    }

    if (fe->dither)
        fe_init_dither(fe->seed);

    /*
     * Buffers for overflow samples and the Hamming window.
     * NOTE(review): the window buffer holds frame_size / 2 entries while
     * fe_create_hamming() is given the full frame_size -- presumably only
     * half of the window is stored; confirm against fe_create_hamming().
     */
    fe->overflow_samps = ckd_calloc(fe->frame_size, sizeof(int16));
    fe->hamming_window = ckd_calloc(fe->frame_size / 2, sizeof(window_t));
    fe_create_hamming(fe->hamming_window, fe->frame_size);

    /* Allocate the mel filterbank descriptor and load its parameters. */
    fe->mel_fb = ckd_calloc(1, sizeof(*fe->mel_fb));
    fe_parse_melfb_params(config, fe, fe->mel_fb);

    if (fe->mel_fb->upper_filt_freq > fe->sampling_rate / 2 + 1.0) {
        E_ERROR("Upper frequency %.1f is higher than samprate/2 (%.1f)\n",
                fe->mel_fb->upper_filt_freq, fe->sampling_rate / 2);
        goto fail;
    }

    fe_build_melfilters(fe->mel_fb);
    fe_compute_melcosine(fe->mel_fb);

    /* Noise statistics are only set up when one of these options is on. */
    if (fe->remove_noise || fe->remove_silence)
        fe->noise_stats = fe_init_noisestats(fe->mel_fb->num_filters);

    fe->vad_data = (vad_data_t *) ckd_calloc(1, sizeof(*fe->vad_data));

    /* The pre-speech frame width follows the output type: the number of
     * mel filters for raw log spectra, the number of cepstra otherwise. */
    if (fe->log_spec == RAW_LOG_SPEC)
        prespch_width = fe->mel_fb->num_filters;
    else
        prespch_width = fe->num_cepstra;
    fe->vad_data->prespch_buf =
        fe_prespch_init(fe->pre_speech + 1, prespch_width, fe->frame_shift);

    /* Temporary FFT, spectrum and mel-spectrum buffers. */
    /* FIXME: Gosh there are a lot of these. */
    fe->spch = ckd_calloc(fe->frame_size, sizeof(*fe->spch));
    fe->frame = ckd_calloc(fe->fft_size, sizeof(*fe->frame));
    fe->spec = ckd_calloc(fe->fft_size, sizeof(*fe->spec));
    fe->mfspec = ckd_calloc(fe->mel_fb->num_filters, sizeof(*fe->mfspec));

    /* Twiddle factor tables, filled in by fe_create_twiddle(). */
    fe->ccc = ckd_calloc(fe->fft_size / 4, sizeof(*fe->ccc));
    fe->sss = ckd_calloc(fe->fft_size / 4, sizeof(*fe->sss));
    fe_create_twiddle(fe);

    if (cmd_ln_boolean_r(config, "-verbose")) {
        fe_print_current(fe);
    }

    /* Finish by starting an utterance on the new front end (the original
     * comment describes this as initializing the overflow buffers). */
    fe_start_utt(fe);
    return fe;

fail:
    /* Shared error exit: release whatever has been built so far. */
    fe_free(fe);
    return NULL;
}