/**
 * Runs the voice activity detector over the input stream frame by frame.
 *
 * The input samples are read as int16_t values from stream_in.ptr. A window
 * is moved over them in hops of EMOVOICEVAD_FRAME_STEP samples and for every
 * hop the result of dsp_vad_calc() is written as one int to stream_out.ptr.
 *
 * @param info                not used by this implementation
 * @param stream_in           input stream; `num` samples are read from `ptr`
 * @param stream_out          output stream; receives one int per processed step
 * @param xtra_stream_in_num  not used by this implementation
 * @param xtra_stream_in      not used by this implementation
 */
void EmoVoiceVAD::transform (ITransformer::info info,
	ssi_stream_t &stream_in,
	ssi_stream_t &stream_out,
	ssi_size_t xtra_stream_in_num,
	ssi_stream_t xtra_stream_in[]) {

	ssi_size_t sample_number = stream_in.num;

	// Validate the length BEFORE subtracting: if ssi_size_t is an unsigned
	// type (NOTE(review): presumably it is - confirm), the subtraction below
	// wraps around for short inputs, so a "steps <= 0" test performed
	// afterwards would never fire and the loop would run far out of bounds.
	// For a positive frame step, fewer than EMOVOICEVAD_FRAME_SIZE samples is
	// exactly the case in which the step count would be zero or negative.
	if (sample_number < EMOVOICEVAD_FRAME_SIZE) {
		ssi_err ("Input vector too short (%d)", EMOVOICEVAD_FRAME_SIZE);
		// Nothing can be processed; leave even if ssi_err() returns.
		return;
	}

	int16_t *inblock = ssi_pcast (int16_t, stream_in.ptr);
	int *outblock = ssi_pcast (int, stream_out.ptr);

	// Number of window positions that fit into the input.
	ssi_size_t steps = (sample_number - (EMOVOICEVAD_FRAME_SIZE - EMOVOICEVAD_FRAME_STEP)) / EMOVOICEVAD_FRAME_STEP;

	for (ssi_size_t i = 0; i < steps; i++) {
		// One detector decision per hop, stored consecutively in the output.
		*outblock++ = dsp_vad_calc ((dsp_sample_t *)voice, (dsp_vad_t *)vad, (dsp_sample_t *)(inblock + i*EMOVOICEVAD_FRAME_STEP));
	}
}
int _asegment_vad(dsp_vad_t *vad, dsp_sample_t *signal, int n_samples, dsp_sample_t ***signal_segment, int **length) { int va=0, i, out_sample; int n_segments =0, last_va=0, samples=0; int max_segments=MAX_SEGMENTS, segment_length = SEGMENT_LENGTH; int delay_len=vad->sigbuf->length; int *_length; dsp_sample_t *voice; dsp_sample_t **_segments; int temp=0; dsp_sample_t *_signal; _signal = (dsp_sample_t *) rs_malloc(n_samples*sizeof(dsp_sample_t),"signal copy"); for (i=0;i<n_samples;i++) _signal[i]=signal[i]; if (!vad) { rs_warning("No voice activity detection info available!"); return -1; } voice = (dsp_sample_t *) rs_malloc(vad->frame_len * sizeof(dsp_sample_t),"Voice frame"); _segments = (dsp_sample_t **) rs_malloc(max_segments * sizeof(dsp_sample_t *),"Signal segments"); _length = (int *) rs_malloc(max_segments * sizeof(int),"Segment lengths"); for (i=0;i<=n_samples-VAD_FRAME_SHIFT || va >=0;i+=VAD_FRAME_SHIFT) { if (i > n_samples-VAD_FRAME_SHIFT) { va = dsp_vad_calc(voice,vad,NULL); } else { if (i>n_samples-vad->frame_len) { int j, new_len=i+vad->frame_len; _signal= (dsp_sample_t *) rs_realloc(_signal,new_len*sizeof(dsp_sample_t),"Signal buffer"); for (j=n_samples;j<new_len;j++) _signal[j]=0; } va = dsp_vad_calc(voice, vad, _signal+i); } if (va >=0 && ((va && !last_va) || (!va && last_va)) && samples >0) { out_sample=i-(delay_len-vad->sigbuf->need_elems)*160; if (out_sample>=n_samples) out_sample=n_samples-1; if (va) { if (n_segments >= max_segments) { max_segments += MAX_SEGMENTS; _segments = (dsp_sample_t **) rs_realloc(_segments,max_segments * sizeof(dsp_sample_t *),"Signal segments"); _length = (int *) rs_realloc(_length,max_segments * sizeof(int),"Segment lengths"); } _segments[n_segments] = (dsp_sample_t *) rs_malloc(segment_length * sizeof(dsp_sample_t),"Signal segment"); if (output) fprintf(stderr,"[%d..",out_sample); } else { if (output) fprintf(stderr,"%d] ",out_sample-1); _length[n_segments]=last_va*VAD_FRAME_SHIFT; n_segments++; 
segment_length=SEGMENT_LENGTH; } } if (!va) last_va=0; if (va==1 && samples>0) { if ((last_va+1)*VAD_FRAME_SHIFT > segment_length) { segment_length += SEGMENT_LENGTH; _segments[n_segments] = (dsp_sample_t *) rs_realloc(_segments[n_segments],segment_length * sizeof(dsp_sample_t),"Signal segment"); } samples+=VAD_FRAME_SHIFT; memcpy(_segments[n_segments]+last_va*VAD_FRAME_SHIFT,voice,VAD_FRAME_SHIFT*sizeof(dsp_sample_t)); last_va++; } temp=0; while (va==0) { temp++; samples+=VAD_FRAME_SHIFT; va = dsp_vad_calc(voice, vad, NULL); } } if (last_va && samples >0) { out_sample=i-(delay_len-vad->sigbuf->need_elems)*160; if (out_sample>=n_samples) out_sample=n_samples; if (output) fprintf(stderr,"%d] ",out_sample-1); _length[n_segments]=last_va*VAD_FRAME_SHIFT; n_segments++; } if (output) fprintf(stderr,";\n"); rs_free(voice); rs_free(_signal); *signal_segment=_segments; *length=_length; return n_segments; }