/**
 * Estimate the best VTLN warping factor for the current utterance.
 *
 * Performs a grid search of the frequency-warping factor (alpha) around
 * the current value of mfcc->para->vtln_alpha (+/- VTLN_RANGE, in steps
 * of VTLN_STEP).  For each candidate the filterbank is re-initialized,
 * MFCCs are re-computed from the waveform stored in recog->speech, and
 * the first result sentence is force-aligned; the alpha giving the best
 * alignment score is kept in mfcc->para->vtln_alpha and the filterbank
 * is re-initialized with it.
 *
 * @param recog [i/o] engine instance (holds the stored waveform)
 * @param r [i/o] recognition process whose first result sentence is
 *                used for the forced alignment
 */
void
vtln_alpha(Recog *recog, RecogProcess *r)
{
  Sentence *s;
  float alpha, alpha_bgn, alpha_end;
  float max_alpha;
  LOGPROB max_score;
  MFCCCalc *mfcc;
  SentenceAlign *align;

  s = &(r->result.sent[0]);
  align = result_align_new();

  max_score = LOG_ZERO;

  printf("------------ begin VTLN -------------\n");

  mfcc = r->am->mfcc;

  /* search range is centered on the current warping factor */
  alpha_bgn = mfcc->para->vtln_alpha - VTLN_RANGE;
  alpha_end = mfcc->para->vtln_alpha + VTLN_RANGE;

  /* initialize to the first candidate so max_alpha is never read
     uninitialized, even if no alignment score ever exceeds LOG_ZERO */
  max_alpha = alpha_bgn;

  for(alpha = alpha_bgn; alpha <= alpha_end; alpha += VTLN_STEP) {
    mfcc->para->vtln_alpha = alpha;
    if (InitFBank(mfcc->wrk, mfcc->para) == FALSE) {
      jlog("ERROR: VTLN: InitFBank() failed\n");
      result_align_free(align);	/* was leaked on this error path */
      return;
    }
    if (wav2mfcc(recog->speech, recog->speechlen, recog) == FALSE) {
      jlog("ERROR: VTLN: wav2mfcc() failed\n");
      result_align_free(align);	/* was leaked on this error path */
      return;
    }
    outprob_prepare(&(r->am->hmmwrk), mfcc->param->samplenum);
    word_align(s->word, s->word_num, mfcc->param, align, r);
    printf("%f: %f\n", alpha, align->allscore);
    if (max_score < align->allscore) {
      max_score = align->allscore;
      max_alpha = alpha;
    }
  }
  printf("MAX: %f: %f\n", max_alpha, max_score);

  /* keep the best factor and leave the filterbank initialized with it */
  mfcc->para->vtln_alpha = max_alpha;
  if (InitFBank(mfcc->wrk, mfcc->para) == FALSE) {
    jlog("ERROR: VTLN: InitFBank() failed\n");
    result_align_free(align);	/* was leaked on this error path */
    return;
  }

  printf("------------ end VTLN -------------\n");

  result_align_free(align);
}
/** 
 * <JA>
 * @brief  セグメントの認識再開処理
 *
 * この関数はデコーダベースVADやショートポーズセグメンテーションによって
 * 入力がセグメントに切られた場合に,その後の認識の再開に関する処理を行う.
 * 具体的には,入力の認識を開始する前に,前回の入力セグメントにおける
 * 巻戻し分のMFCC列から認識を開始する. さらに,前回のセグメンテーション時に
 * 未処理だった残りの音声サンプルがあればそれも処理する. 
 *
 * @param recog [i/o] エンジンインスタンス
 *
 * @return エラー時 -1,正常時 0 を返す. また,この入力断片の処理中に
 * 文章の区切りが見つかったときは第1パスをここで中断するために 1 を返す. 
 * </JA>
 * <EN>
 * @brief  Resuming recognition for short pause segmentation.
 *
 * This function process overlapped data and remaining speech prior
 * to the next input when input was segmented at last processing.
 *
 * @param recog [i/o] engine instance
 *
 * @return -1 on error (tell caller to terminate), 0 on success (allow caller
 * to call me for the next segment), or 1 when an end-of-sentence detected
 * at this point (in that case caller will stop input and go to 2nd pass)
 * </EN>
 *
 * @callgraph
 * @callergraph
 * 
 */
int
RealTimeResume(Recog *recog)
{
  MFCCCalc *mfcc;
  RealBeam *r;
  boolean ok_p;
#ifdef SPSEGMENT_NAIST
  RecogProcess *p;
#endif
  PROCESS_AM *am;

  r = &(recog->real);

  /* prepare work area for calculation */
  if (recog->jconf->input.type == INPUT_WAVEFORM) {
    reset_mfcc(recog);
  }
  /* prepare cache area for acoustic computation of HMM states and mixtures */
  for(am=recog->amlist;am;am=am->next) {
    outprob_prepare(&(am->hmmwrk), r->maxframelen);
  }
  /* prepare to process all data already stored in param (the frames
     rewound from the previous segment) */
  for(mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
    if (mfcc->param->samplenum == 0) mfcc->valid = FALSE;
    else mfcc->valid = TRUE;
#ifdef RDEBUG
    printf("Resume: %02d: f=%d\n", mfcc->id, mfcc->mfcc->param->samplenum-1);
#endif
    /* reset frame count */
    mfcc->f = 0;
    /* prepare for MAP-CMN */
    if (mfcc->para->cmn || mfcc->para->cvn) CMN_realtime_prepare(mfcc->cmn.wrk);
  }

#ifdef BACKEND_VAD
  if (recog->jconf->decodeopt.segment) {
    spsegment_init(recog);
  }
  /* not exec pass1 begin callback here */
#else
  recog->triggered = FALSE;
  /* fire segment/pass1 begin callbacks once if any MFCC instance has data */
  for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
    if (!mfcc->valid) continue;
    callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
    callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
    recog->triggered = TRUE;
    break;
  }
#endif

  /* proceed recognition for all frames in param */
  while(1) {
    ok_p = TRUE;
    for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
      if (! mfcc->valid) continue;
      if (mfcc->f < mfcc->param->samplenum) {
	mfcc->valid = TRUE;
	ok_p = FALSE;
      } else {
	mfcc->valid = FALSE;
      }
    }
    if (ok_p) {
      /* all MFCC has been processed, end of loop */
      break;
    }

    /* proceed recognition of frame mfcc->f for each instance */
    switch (decode_proceed(recog)) {
    case -1: /* error */
      return -1;
      break;
    case 0:			/* success */
      break;
    case 1:			/* segmented */
      /* segmented, end procs ([0..f]) */
      r->last_is_segmented = TRUE;
      return 1;		/* segmented by this function */
    }

#ifdef BACKEND_VAD
    /* check up trigger in case of VAD segmentation */
    if (recog->jconf->decodeopt.segment) {
      if (recog->triggered == FALSE) {
	if (spsegment_trigger_sync(recog)) {
	  callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
	  callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
	  recog->triggered = TRUE;
	}
      }
    }
#endif

    /* call frame-wise callback */
    callback_exec(CALLBACK_EVENT_PASS1_FRAME, recog);

    /* one frame has been processed, proceed frame pointer */
    for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
      if (!mfcc->valid) continue;
      mfcc->f++;
    }
  }

  /* do the last shift here (the shift skipped at the previous segment) */
  if (recog->jconf->input.type == INPUT_WAVEFORM) {
    memmove(r->window, &(r->window[recog->jconf->input.frameshift]), sizeof(SP16) * (r->windowlen - recog->jconf->input.frameshift));
    r->windownum -= recog->jconf->input.frameshift;

    /* now that the search status has been prepared for the next input, we
       first process the rest unprocessed samples at the last session */
    if (r->rest_len > 0) {
      return(RealTimePipeLine(r->rest_Speech, r->rest_len, recog));
    }
  }

  /* the recognition process will continue for the newly incoming samples... */
  return 0;
}
/** 
 * <EN>
 * @brief  Process one input frame of 1st-pass recognition.
 *
 * Advances decoding by one frame for all MFCC instances: fires the
 * recognition/segment/pass1 begin callbacks when an instance is at its
 * initial frame, calls decode_proceed(), checks the VAD trigger
 * (BACKEND_VAD) and a decoder-based restart request, and when a restart
 * with reprocessing is requested, re-runs decoding over the rewound
 * MFCC frames.  Finally fires the per-frame callback if any instance is
 * still valid.
 *
 * @param recog [i/o] engine instance
 *
 * @return -1 on error, 0 on success, or 1 when decode_proceed() requests
 * input segmentation at this frame (r->last_is_segmented is set).
 * </EN>
 */
static int
proceed_one_frame(Recog *recog)
{
  MFCCCalc *mfcc;
  RealBeam *r;
  int maxf;
  PROCESS_AM *am;
  int rewind_frame;
  boolean reprocess;
  boolean ok_p;

  r = &(recog->real);

  /* call recognition start callback */
  ok_p = FALSE;
  maxf = 0;
  for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
    if (!mfcc->valid) continue;
    if (maxf < mfcc->f) maxf = mfcc->f;
    if (mfcc->f == 0) {
      ok_p = TRUE;
    }
  }
  if (ok_p && maxf == 0) {
    /* call callback when at least one of MFCC has initial frame */
    if (recog->jconf->decodeopt.segment) {
#ifdef BACKEND_VAD
      /* not exec pass1 begin callback here */
#else
      if (!recog->process_segment) {
	callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
      }
      callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
      callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
      recog->triggered = TRUE;
#endif
    } else {
      callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
      callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
      recog->triggered = TRUE;
    }
  }

  /* proceed recognition of frame mfcc->f for each instance */
  switch (decode_proceed(recog)) {
  case -1: /* error */
    return -1;
    break;
  case 0:			/* success */
    break;
  case 1:			/* segmented */
    /* set flag which indicates that the input has ended with
       segmentation request */
    r->last_is_segmented = TRUE;
    /* tell the caller to be segmented by this function */
    return 1;
  }

#ifdef BACKEND_VAD
  /* check up trigger in case of VAD segmentation */
  if (recog->jconf->decodeopt.segment) {
    if (recog->triggered == FALSE) {
      if (spsegment_trigger_sync(recog)) {
	if (!recog->process_segment) {
	  callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
	}
	callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
	callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
	recog->triggered = TRUE;
      }
    }
  }
#endif

  if (spsegment_need_restart(recog, &rewind_frame, &reprocess) == TRUE) {
    /* set total length to the current frame */
    for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
      if (!mfcc->valid) continue;
      mfcc->param->header.samplenum = mfcc->f + 1;
      mfcc->param->samplenum = mfcc->f + 1;
    }
    /* do rewind for all mfcc here */
    spsegment_restart_mfccs(recog, rewind_frame, reprocess);
    /* also tell adin module to rehash the concurrent audio input */
    recog->adin->rehash = TRUE;
    /* reset outprob cache for all AM */
    for(am=recog->amlist;am;am=am->next) {
      outprob_prepare(&(am->hmmwrk), am->mfcc->param->samplenum);
    }
    if (reprocess) {
      /* process the backstep MFCCs here */
      while(1) {
	ok_p = TRUE;
	for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
	  if (! mfcc->valid) continue;
	  mfcc->f++;
	  if (mfcc->f < mfcc->param->samplenum) {
	    mfcc->valid = TRUE;
	    ok_p = FALSE;
	  } else {
	    mfcc->valid = FALSE;
	  }
	}
	if (ok_p) {
	  /* all MFCC has been processed, end of loop */
	  /* step frame pointers back so the normal loop resumes correctly */
	  for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
	    if (! mfcc->valid) continue;
	    mfcc->f--;
	  }
	  break;
	}
	/* proceed recognition of frame mfcc->f for each instance */
	switch (decode_proceed(recog)) {
	case -1: /* error */
	  return -1;
	  break;
	case 0:		/* success */
	  break;
	case 1:		/* segmented */
	  /* ignore segmentation while in the backstep segment */
	  break;
	}
	/* call frame-wise callback */
	callback_exec(CALLBACK_EVENT_PASS1_FRAME, recog);
      }
    }
  }

  /* call frame-wise callback if at least one of MFCC is valid at this frame */
  for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
    if (mfcc->valid) {
      callback_exec(CALLBACK_EVENT_PASS1_FRAME, recog);
      break;
    }
  }

  return 0;
}
/** 
 * <EN>
 * @brief  Preparation for the on-the-fly 1st pass decoding.
 *
 * Resets the work variables and prepares the parameter holders, the
 * MFCC work area and the acoustic output-probability caches before
 * recognition of a new input (segment) begins.
 *
 * @param recog [i/o] engine instance
 *
 * @return TRUE on success, FALSE on failure.
 * </EN>
 *
 * @callgraph
 * @callergraph
 * 
 */
boolean
RealTimePipeLinePrepare(Recog *recog)
{
  RealBeam *rb;
  PROCESS_AM *amp;
  MFCCCalc *calc;
#ifdef SPSEGMENT_NAIST
  RecogProcess *p;
#endif

  rb = &(recog->real);

  /* reset the sample window counter */
  rb->windownum = 0;

  /* initialize the parameter holder of every MFCC instance */
  for (calc = recog->mfcclist; calc != NULL; calc = calc->next) {
    if (recog->jconf->input.speech_input != SP_MFCMODULE) {
      init_param(calc);
    } else if (mfc_module_set_header(calc, recog) == FALSE) {
      return FALSE;
    }
    /* reserve room for per-frame parameter vectors; it will be
       expanded later on demand */
    if (param_alloc(calc->param, 1, calc->param->veclen) == FALSE) {
      j_internal_error("ERROR: segmented: failed to allocate memory for rest param\n");
    }
    /* frame counter starts from zero */
    calc->f = 0;
  }

  /* check type coherence between the prepared param and each acoustic model */
  if (recog->jconf->input.paramtype_check_flag) {
    for (amp = recog->amlist; amp != NULL; amp = amp->next) {
      if (!check_param_coherence(amp->hmminfo, amp->mfcc->param)) {
	jlog("ERROR: input parameter type does not match AM\n");
	return FALSE;
      }
    }
  }

  /* set up the MFCC computation work area for waveform input */
  if (recog->jconf->input.type == INPUT_WAVEFORM) {
    reset_mfcc(recog);
  }

  /* prepare cache area for acoustic computation of HMM states and mixtures */
  for (amp = recog->amlist; amp != NULL; amp = amp->next) {
    outprob_prepare(&(amp->hmmwrk), rb->maxframelen);
  }

#ifdef BACKEND_VAD
  if (recog->jconf->decodeopt.segment) {
    /* initialize segmentation parameters */
    spsegment_init(recog);
  }
#else
  recog->triggered = FALSE;
#endif

#ifdef DEBUG_VTLN_ALPHA_TEST
  /* store speech */
  recog->speechlen = 0;
#endif

  return TRUE;
}
/** 
 * <JA>
 * @brief  フレーム同期ビーム探索メイン関数(バッチ処理用)
 *
 * 与えられた入力ベクトル列に対して第1パス(フレーム同期ビーム探索)を
 * 行い,その結果を出力する. また全フレームに渡る単語終端を,第2パス
 * のために単語トレリス構造体に格納する. 
 *
 * この関数は入力ベクトル列があらかじめ得られている場合に用いられる. 
 * 第1パスが入力と並列して実行されるオンライン認識の場合,
 * この関数は用いられず,代わりにこのファイルで定義されている各サブ関数が
 * 直接 realtime-1stpass.c 内から呼ばれる. 
 *
 * @param recog [in] エンジンインスタンス
 * </JA>
 * <EN>
 * @brief  Frame synchronous beam search: the main (for batch mode)
 *
 * This function perform the 1st recognition pass of frame-synchronous beam
 * search and output the result.  It also stores all the word ends in every
 * input frame to word trellis structure.
 *
 * This function will be called if the whole input vector is already given
 * to the end.  When online recognition, where the 1st pass will be
 * processed in parallel with input, this function will not be used.
 * In that case, functions defined in this file will be directly called
 * from functions in realtime-1stpass.c.
 *
 * @param recog [in] engine instance
 * </EN>
 * @callgraph
 * @callergraph
 */
boolean
get_back_trellis(Recog *recog)
{
  boolean ok_p;
  MFCCCalc *mfcc;
  int rewind_frame;
  PROCESS_AM *am;
  boolean reprocess;

  /* initialize mfcc instances */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {
    /* mark all as valid, since all frames are fully prepared beforehand */
    if (mfcc->param->samplenum == 0) mfcc->valid = FALSE;
    else mfcc->valid = TRUE;
    /* set frame pointers to 0 */
    mfcc->f = 0;
  }

  /* callback of process start */
#ifdef BACKEND_VAD
  if (recog->jconf->decodeopt.segment) {
    /* at first time, recognition does not start yet */
    /* reset segmentation flags */
    spsegment_init(recog);
  } else {
    /* execute callback for pass1 begin here */
    callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
    callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
    recog->triggered = TRUE;
  }
#else
  if (recog->jconf->decodeopt.segment) {
    if (!recog->process_segment) {
      callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
    }
    callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
  } else {
    callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
  }
  callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
  recog->triggered = TRUE;
#endif

  while(1) {
    ok_p = TRUE;
    for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
      if (! mfcc->valid) continue;
      if (mfcc->f < mfcc->param->samplenum) {
	mfcc->valid = TRUE;
	ok_p = FALSE;
      } else {
	mfcc->valid = FALSE;
      }
    }
    if (ok_p) {
      /* all MFCC has been processed, end of loop */
      break;
    }

    switch (decode_proceed(recog)) {
    case -1: /* error */
      return FALSE;
      break;
    case 0:			/* success */
      break;
    case 1:			/* segmented */
      /* search terminated: processed input = [0..t-2] */
      /* end the 1st pass at this point */
      decode_end_segmented(recog);
      /* terminate 1st pass here */
      return TRUE;
    }

#ifdef BACKEND_VAD
    /* check up trigger in case of VAD segmentation */
    if (recog->jconf->decodeopt.segment) {
      if (recog->triggered == FALSE) {
	if (spsegment_trigger_sync(recog)) {
	  if (!recog->process_segment) {
	    callback_exec(CALLBACK_EVENT_RECOGNITION_BEGIN, recog);
	  }
	  callback_exec(CALLBACK_EVENT_SEGMENT_BEGIN, recog);
	  callback_exec(CALLBACK_EVENT_PASS1_BEGIN, recog);
	  recog->triggered = TRUE;
	}
      }
    }
#endif

    if (spsegment_need_restart(recog, &rewind_frame, &reprocess) == TRUE) {
      /* do rewind for all mfcc here */
      spsegment_restart_mfccs(recog, rewind_frame, reprocess);
      /* reset outprob cache for all AM */
      for(am=recog->amlist;am;am=am->next) {
	outprob_prepare(&(am->hmmwrk), am->mfcc->param->samplenum);
      }
    }
    /* call frame-wise callback */
    callback_exec(CALLBACK_EVENT_PASS1_FRAME, recog);

    /* one frame has been processed, proceed frame pointer */
    for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
      if (!mfcc->valid) continue;
      mfcc->f++;
    }
    if (recog->process_want_terminate) {
      /* termination requested */
      decode_end_segmented(recog);
      return TRUE;
    }
  }

  /* do the final frame processing, output the recognition result and
     finish the 1st pass */
  decode_end(recog);

  return TRUE;
}