/** 
 * <JA>
 * @brief 発話区間終了の検知
 *
 * ショートポーズセグメンテーション指定時,
 * 発話区間の終了を検出する. 無音単語が連続して最尤候補となるフレーム数を
 * カウントし,一定時間持続後にふたたび音声がトリガした時点で入力を
 * 区切る.
 *
 * SPSEGMENT_NAIST 定義時は,よりセグメント前後・間の無音時間が長い場合を
 * 想定したデコーダベースの VAD に切り替わる. この場合,音声トリガ検出前
 * (r->pass1.after_trigger == FALSE)では,仮説を生成しない状態で認識処理を
 * 続ける. 音声開始を検出したら特徴量を一定長 (r->config->successive.sp_margin)
 * 分だけ巻き戻して,通常の認識を開始する(r->pass1.after_trigger == TRUE).
 * 通常の認識中に無音区間が長く (r->config->successive.sp_frame_duration 以上)
 * 続いたら,そこで入力を区切る.
 *
 * @param r [i/o] 音声認識処理インスタンス
 * @param time [in] 現在の入力フレーム
 *
 * @return TRUE (このフレームでの終了を検出したら), FALSE (終了でない場合)
 * </JA>
 * <EN>
 * @brief Speech end point detection.
 *
 * Detect end-of-input by duration of short-pause words when short-pause
 * segmentation is enabled.  When a pause word gets maximum score for
 * successive frames, the segment will be treated as pause frames.
 * When speech re-triggers, the current input will be segmented at that point.
 *
 * When SPSEGMENT_NAIST is defined, this function performs an extended version
 * of the short pause segmentation, called "decoder-based VAD".  Before the
 * speech trigger (r->pass1.after_trigger == FALSE), it tells the recognition
 * functions not to generate word trellis and to continue calculation.  If a
 * speech trigger is found (a non-pause word gets maximum score), the
 * input frames are rewound for a certain number of frames
 * (r->config->successive.sp_margin) and the normal recognition process
 * starts from the rewound frames (r->pass1.after_trigger = TRUE).
 * When a pause frame duration reaches a limit
 * (r->config->successive.sp_frame_duration), it terminates the search.
 *
 * @param r [i/o] recognition process instance
 * @param time [in] current input frame
 *
 * @return TRUE if end-of-input detected at this frame, FALSE if not.
 * </EN>
 * @callgraph
 * @callergraph
 */
boolean
detect_end_of_segment(RecogProcess *r, int time)
{
  FSBeam *d;
  TRELLIS_ATOM *tre;
  LOGPROB maxscore = LOG_ZERO;	/* best backscore among word ends at this frame */
  TRELLIS_ATOM *tremax = NULL;	/* trellis word holding the best backscore */
  int count = 0;		/* number of word ends found at this frame */
  boolean detected = FALSE;	/* TRUE = this frame is judged as a pause frame */
#ifdef SPSEGMENT_NAIST
  MFCCCalc *mfcc;		/* NOTE(review): declared but not referenced in this body */
  WORD_ID wid;
  int j;
  TOKEN2 *tk;
  int startframe;
#endif

  d = &(r->pass1);

#ifdef SPSEGMENT_NAIST
  if (! d->after_trigger) {
    /* We are in the initial long pause segment before any speech trigger.
       Judge the current frame from the live beam tokens (no trellis is
       generated yet in this state). */
    /* find word end of maximum score from beam status */
    for (j = d->n_start; j <= d->n_end; j++) {
      tk = &(d->tlist[d->tn][d->tindex[d->tn][j]]);
      if (r->wchmm->stend[tk->node] != WORD_INVALID) {
	if (maxscore < tk->score) {
	  maxscore = tk->score;
	  wid = r->wchmm->stend[tk->node];
	}
      }
    }
    /* pause frame if no word-end token exists, or the best word end is a
       silence word; `wid` is only read when at least one word end was found */
    if (maxscore == LOG_ZERO) detected = TRUE;
    else if (is_sil(wid, r)) detected = TRUE;

    if (detected) {
      /***********************/
      /* this is noise frame */
      /***********************/
      /* reset trigger duration */
      d->trigger_duration = 0;

      /* if noise goes on for more than a certain number of frames, shrink
	 the noise area to avoid unlimited memory usage */
      if (r->am->mfcc->f > SPSEGMENT_NAIST_AUTOSHRINK_LIMIT) {
	d->want_rewind = TRUE;
	d->rewind_frame = r->am->mfcc->f - r->config->successive.sp_margin;
	d->want_rewind_reprocess = FALSE;
	if (debug2_flag) {
	  jlog("DEBUG: pause exceeded %d, rewind\n", SPSEGMENT_NAIST_AUTOSHRINK_LIMIT);
	}
	return FALSE;
      }

      /* keep going */
      d->want_rewind = FALSE;

    } else {
      /************************/
      /* this is speech frame */
      /************************/
      /* increment trigger duration */
      d->trigger_duration++;

      /* if not enough duration, do not treat as an up-trigger yet */
      if (d->trigger_duration < r->config->successive.sp_delay) {
	/* just continue detection */
	return FALSE;
      }

      /***************************/
      /* found speech up-trigger */
      /***************************/
      /* set backstep point: rewind by sp_margin frames, clipped at frame 0 */
      if (r->am->mfcc->f < r->config->successive.sp_margin) {
	startframe = 0;
      } else {
	startframe = r->am->mfcc->f - r->config->successive.sp_margin;
      }
      if (debug2_flag) {
	jlog("DEBUG: speech triggered\n");
	jlog("DEBUG: word=[%s] dur=%d\n", r->lm->winfo->woutput[wid], d->trigger_duration);
	jlog("DEBUG: backstep behind %d (from %d to %d) frame and start process\n", r->config->successive.sp_margin, r->am->mfcc->f, startframe);
      }

      /* if the pause segment was short, keep the word context of the last
	 segment; else, reset the context */
      if (r->lmtype == LM_PROB) {
	if (startframe > 0) {
	  r->sp_break_last_word = WORD_INVALID;
	}
      }

      /* reset sp duration */
      d->sp_duration = 0;

      /* request the caller to rewind the search to the backstep point and
	 re-start with normal search */
      d->want_rewind = TRUE;
      d->rewind_frame = startframe;
      d->want_rewind_reprocess = TRUE;
      /* this will enter normal search in the next processing */
      d->after_trigger = TRUE;
    }
    /* tell the caller not to segment */
    return FALSE;
  }
#endif /* SPSEGMENT_NAIST */

  /* look for the best trellis word on the given time frame */
  for(tre = r->backtrellis->list; tre != NULL && tre->endtime == time; tre = tre->next) {
    if (maxscore < tre->backscore) {
      maxscore = tre->backscore;
      tremax = tre;
    }
    count++;
  }
  if (tremax == NULL) {
    /* no word end: possible in the very beginning of input */
    detected = TRUE;		/* assume it's in the short-pause duration */
  } else if (count > 0) {
    /* words found --- check if the maximum-score one is a pause word */
    if (is_sil(tremax->wid, r)) {
      detected = TRUE;
    }
  }

#ifdef SPSEGMENT_NAIST
  /************************************************************************/
  /************************************************************************/

  /* detected = TRUE if noise frame, or FALSE if speech frame */

  /* check sp segment duration */
  if (d->first_sparea) {
    /* we are in the first sp segment */
    if (d->in_sparea && detected) {
      /* sp continues */
      d->sp_duration++;
      /* when sp continues more than -spdur plus -spmargin, it means that
	 although a speech trigger has been detected for some reason,
	 no actual speech has been found at first. */
      /* in this case we force trigger to end this input */
      if (d->sp_duration > r->config->successive.sp_delay + r->config->successive.sp_margin + r->config->successive.sp_frame_duration) {
	d->in_sparea = FALSE;
	d->first_sparea = FALSE;
	if (debug2_flag) {
	  jlog("DEBUG: no valid speech starts, force trigger at %d\n", r->am->mfcc->f);
	}
      }
    } else if (d->in_sparea && !detected) {
      /* found speech frame */
      d->in_sparea = FALSE;
      d->first_sparea = FALSE;
      if (debug2_flag) {
	jlog("DEBUG: speech segment start at %d\n", r->am->mfcc->f);
      }
    }
  } else {
    /* we are either in speech segment, or trailing sp segment */
    if (!d->in_sparea) {
      /* we are in speech segment */
      if (detected) {
	/* detected end of speech segment (beginning of sp segment) */
	/* mark this frame as a "temporal" beginning of the short-pause segment */
	d->tmp_sparea_start = time;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
	if (r->lmtype == LM_PROB) {
	  /* store the best word in this frame as resuming word */
	  d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
	}
#endif
	d->in_sparea = TRUE;
	d->sp_duration = 1;
      } else {
	/* speech continues */
	/* keep recognizing */
      }
    } else {
      /* we are in trailing sp segment */
      if (detected) {
	/* short pause frame continues */
	d->sp_duration++;
	/* keep word as the "beginning" of next sp segment */
	if (r->lmtype == LM_PROB) {
#ifdef SP_BREAK_RESUME_WORD_BEGIN
	  /* if this segment was triggered by (tremax == NULL) (in case of
	     the first several frames of input), the sp word (to be used as
	     resuming word in the next segment) is not yet set.
	     it will be detected here */
	  if (d->tmp_sp_break_last_word == WORD_INVALID) {
	    if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
	  }
#else
	  /* resume word at the "end" of sp segment */
	  /* simply update the best sp word */
	  if (tremax != NULL) d->last_tre_word = tremax->wid;
#endif
	}
	if (d->sp_duration >= r->config->successive.sp_frame_duration) {
	  /* silence over, segment the recognition here */
	  /* store beginning frame of the segment */
	  //d->sparea_start = d->tmp_sparea_start;
	  r->am->mfcc->sparea_start = time - r->config->successive.sp_frame_duration;
	  if (r->lmtype == LM_PROB) {
#ifdef SP_BREAK_RESUME_WORD_BEGIN
	    /* resume word = most likely sp word on beginning frame of the segment */
	    r->sp_break_last_word = d->tmp_sp_break_last_word;
#else
	    /* resume word = most likely sp word on end frame of the segment */
	    r->sp_break_last_word = d->last_tre_word;
#endif
	  }
	  if (debug2_flag) {
	    jlog("DEBUG: trailing silence end, end this segment at %d\n", r->am->mfcc->f);
	  }
	  /* reset VAD state for the next segment */
	  d->after_trigger = FALSE;
	  d->trigger_duration = 0;
	  d->want_rewind = FALSE;
	  /*** segment: [sparea_start - time-1] ***/
	  return(TRUE);
	}
	/* else, keep recognition */
      } else {
	/* speech re-triggered */
	/* keep recognition */
	d->in_sparea = FALSE;
      }
    }
  }

  d->want_rewind = FALSE;

#else  /* ~SPSEGMENT_NAIST */
  /************************************************************************/
  /************************************************************************/

  /* check sp segment duration */
  if (d->in_sparea && detected) {
    /* we are already in sp segment and sp continues */
    d->sp_duration++;		/* increment count */
#ifdef SP_BREAK_RESUME_WORD_BEGIN
    /* resume word at the "beginning" of sp segment */
    /* if this segment was triggered by (tremax == NULL) (in case of the
       first several frames of input), the sp word (to be used as resuming
       word in the next segment) is not yet set.
       it will be detected here */
    if (d->tmp_sp_break_last_word == WORD_INVALID) {
      if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
    }
#else
    /* resume word at the "end" of sp segment */
    /* simply update the best sp word */
    if (tremax != NULL) d->last_tre_word = tremax->wid;
#endif
  }

  /* check if sp segment begins at this frame */
  else if (!d->in_sparea && detected) {
    /* mark this frame as a "temporal" beginning of the short-pause segment */
    d->tmp_sparea_start = time;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
    /* store the best word in this frame as resuming word */
    d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
#endif
    d->in_sparea = TRUE;	/* yes, we are in sp segment */
    d->sp_duration = 1;		/* initialize duration count */
#ifdef SP_BREAK_DEBUG
    jlog("DEBUG: sp start %d\n", time);
#endif /* SP_BREAK_DEBUG */
  }

  /* check if sp segment ends at this frame */
  else if (d->in_sparea && !detected) {
    /* (time-1) is end frame of pause segment */
    d->in_sparea = FALSE;	/* we are not in sp segment */
#ifdef SP_BREAK_DEBUG
    jlog("DEBUG: sp end %d\n", time);
#endif /* SP_BREAK_DEBUG */
    /* check length of the duration */
    if (d->sp_duration < r->config->successive.sp_frame_duration) {
      /* too short segment: do not break, continue 1st pass */
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: too short (%d<%d), ignored\n", d->sp_duration, r->config->successive.sp_frame_duration);
#endif /* SP_BREAK_DEBUG */
    } else if (d->first_sparea) {
      /* do not break at first sp segment: it corresponds to silB */
      d->first_sparea = FALSE;
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: first silence, ignored\n");
#endif /* SP_BREAK_DEBUG */
    } else {
      /* segment end determined: break 1st pass and go to 2nd pass */
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: >> segment [%d..%d]\n", r->am->mfcc->sparea_start, time-1);
#endif /* SP_BREAK_DEBUG */
      /* store beginning frame of the segment */
      r->am->mfcc->sparea_start = d->tmp_sparea_start;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
      /* resume word = most likely sp word on beginning frame of the segment */
      r->sp_break_last_word = d->tmp_sp_break_last_word;
#else
      /* resume word = most likely sp word on end frame of the segment */
      r->sp_break_last_word = d->last_tre_word;
#endif
      /*** segment: [sparea_start - time-1] ***/
      return(TRUE);
    }
  }
#endif  /* ~SPSEGMENT_NAIST */

#ifdef SP_BREAK_EVAL
  jlog("DEBUG: [%d %d %d]\n", time, count, (detected) ? 50 : 0);
#endif
  return (FALSE);
}
/** 
 * <EN>
 * @brief Launch a recognition process instance.
 *
 * This function will create a recognition process instance
 * using the given SEARCH configuration, and launch a recognizer for
 * the search.  Then the created instance will be installed to the
 * engine instance.  The sconf should be registered to the global
 * jconf before calling this function.
 *
 * </EN>
 *
 * <JA>
 * @brief 認識処理インスタンスを立ち上げる. 
 *
 * この関数は,与えられた SEARCH 設定に従って 認識処理インスタンスを生成し,
 * 対応する音声認識器を構築します.その後,その生成された認識処理インスタンスは
 * 新たにエンジンインスタンスに登録されます.SEARCH設定はこの関数を
 * 呼ぶ前にあらかじめ全体設定jconfに登録されている必要があります.
 *
 * </JA>
 *
 * @param recog [i/o] engine instance
 * @param sconf [in] SEARCH configuration to launch
 *
 * @return TRUE on success, or FALSE on error.
 *
 * @callgraph
 * @callergraph
 * @ingroup instance
 *
 */
boolean
j_launch_recognition_instance(Recog *recog, JCONF_SEARCH *sconf)
{
  RecogProcess *p;
  PROCESS_AM *am;
  PROCESS_LM *lm;

  jlog("STAT: composing recognizer instance SR%02d %s (AM%02d %s, LM%02d %s)\n", sconf->id, sconf->name, sconf->amconf->id, sconf->amconf->name, sconf->lmconf->id, sconf->lmconf->name);

  /* allocate recognition instance */
  p = j_recogprocess_new(recog, sconf);

  /* assign the corresponding AM instance and LM instance to use;
     the loops scan all combinations without breaking, so the last match
     wins if duplicates exist */
  for(lm=recog->lmlist;lm;lm=lm->next) {
    if (sconf->lmconf == lm->config) {
      for(am=recog->amlist;am;am=am->next) {
	if (sconf->amconf == am->config) {
	  p->am = am;
	  p->lm = lm;
	}
      }
    }
  }

  if (p->config->sw.triphone_check_flag && p->am->hmminfo->is_triphone) {
    /* go into interactive triphone HMM check mode */
    hmm_check(p);
  }

  /******************************************/
  /******** set work area and flags *********/
  /******************************************/

  /* copy values of sub instances for handy access during recognition */
  /* set lm type */
  p->lmtype = p->lm->lmtype;
  p->lmvar  = p->lm->lmvar;

  p->graphout = p->config->graph.enabled;

  /* set flag for context dependent handling: forced value if specified,
     otherwise enabled exactly when the AM is a triphone model */
  if (p->config->force_ccd_handling) {
    p->ccd_flag = p->config->ccd_handling;
  } else {
    if (p->am->hmminfo->is_triphone) {
      p->ccd_flag = TRUE;
    } else {
      p->ccd_flag = FALSE;
    }
  }

  /* iwsp prepare: inter-word short pause requires a multi-path AM with a
     short-pause model defined */
  if (p->lm->config->enable_iwsp) {
    if (p->am->hmminfo->multipath) {
      /* find short-pause model */
      if (p->am->hmminfo->sp == NULL) {
	jlog("ERROR: iwsp enabled but no short pause model \"%s\" in hmmdefs\n", p->am->config->spmodel_name);
	return FALSE;
      }
      p->am->hmminfo->iwsp_penalty = p->am->config->iwsp_penalty;
    } else {
      jlog("ERROR: \"-iwsp\" needs multi-path mode\n");
      jlog("ERROR: you should use multi-path AM, or specify \"-multipath\" with \"-iwsp\"\n");
      return FALSE;
    }
  }

  /* for short-pause segmentation */
  if (p->config->successive.enabled) {
    if (p->config->successive.pausemodelname) {
      /* pause model name string specified, divide it and store to p */
      char *s;
      int n;
      /* first pass over a working copy counts the names, second pass (on a
	 fresh copy, since strtok destroys its input) records the pointers */
      p->pass1.pausemodelnames = (char*)mymalloc(strlen(p->config->successive.pausemodelname)+1);
      strcpy(p->pass1.pausemodelnames, p->config->successive.pausemodelname);
      n = 0;
      for (s = strtok(p->pass1.pausemodelnames, " ,"); s; s = strtok(NULL, " ,")) {
	n++;
      }
      p->pass1.pausemodelnum = n;
      p->pass1.pausemodel = (char **)mymalloc(sizeof(char *) * n);
      strcpy(p->pass1.pausemodelnames, p->config->successive.pausemodelname);
      n = 0;
      for (s = strtok(p->pass1.pausemodelnames, " ,"); s; s = strtok(NULL, " ,")) {
	p->pass1.pausemodel[n++] = s;
      }
    } else {
      p->pass1.pausemodel = NULL;
    }
    /* check if a pause word exists in the dictionary */
    {
      WORD_ID w;
      boolean ok_p;
      ok_p = FALSE;
      for(w=0;w<p->lm->winfo->num;w++) {
	if (is_sil(w, p)) {
	  ok_p = TRUE;
	  break;
	}
      }
      if (!ok_p) {
#ifdef SPSEGMENT_NAIST
	jlog("Error: no pause word in dictionary needed for decoder-based VAD\n");
#else
	jlog("Error: no pause word in dictionary needed for short-pause segmentation\n");
#endif
	jlog("Error: you should have at least one pause word in dictionary\n");
	jlog("Error: you can specify pause model names by \"-pausemodels\"\n");
	return FALSE;
      }
    }
  }

  /**********************************************/
  /******** set model-specific defaults *********/
  /**********************************************/
  if (p->lmtype == LM_PROB) {
    /* set default LM parameters if not specified, chosen per AM type */
    if (!p->config->lmp.lmp_specified) {
      if (p->am->hmminfo->is_triphone) {
	p->config->lmp.lm_weight  = DEFAULT_LM_WEIGHT_TRI_PASS1;
	p->config->lmp.lm_penalty = DEFAULT_LM_PENALTY_TRI_PASS1;
      } else {
	p->config->lmp.lm_weight  = DEFAULT_LM_WEIGHT_MONO_PASS1;
	p->config->lmp.lm_penalty = DEFAULT_LM_PENALTY_MONO_PASS1;
      }
    }
    if (!p->config->lmp.lmp2_specified) {
      if (p->am->hmminfo->is_triphone) {
	p->config->lmp.lm_weight2  = DEFAULT_LM_WEIGHT_TRI_PASS2;
	p->config->lmp.lm_penalty2 = DEFAULT_LM_PENALTY_TRI_PASS2;
      } else {
	p->config->lmp.lm_weight2  = DEFAULT_LM_WEIGHT_MONO_PASS2;
	p->config->lmp.lm_penalty2 = DEFAULT_LM_PENALTY_MONO_PASS2;
      }
    }
    if (p->config->lmp.lmp_specified != p->config->lmp.lmp2_specified) {
      jlog("WARNING: m_fusion: only -lmp or -lmp2 specified, LM weights may be unbalanced\n");
    }
  }

  /****************************/
  /******* build wchmm ********/
  /****************************/
  if (p->lmtype == LM_DFA) {
    /* execute generation of global grammar and build of wchmm */
    multigram_build(p); /* some modification occurred if it returns TRUE */
  }

  if (p->lmtype == LM_PROB) {
    /* build wchmm with N-gram */
    p->wchmm = wchmm_new();
    p->wchmm->lmtype = p->lmtype;
    p->wchmm->lmvar  = p->lmvar;
    p->wchmm->ccd_flag = p->ccd_flag;
    p->wchmm->category_tree = FALSE;
    p->wchmm->hmmwrk = &(p->am->hmmwrk);
    /* assign models */
    p->wchmm->ngram = p->lm->ngram;
    if (p->lmvar == LM_NGRAM_USER) {
      /* register LM functions for 1st pass here */
      p->wchmm->uni_prob_user = p->lm->lmfunc.uniprob;
      p->wchmm->bi_prob_user  = p->lm->lmfunc.biprob;
    }
    p->wchmm->winfo   = p->lm->winfo;
    p->wchmm->hmminfo = p->am->hmminfo;
    /* NOTE(review): category_tree was set to FALSE just above, so the first
       branch below is unreachable here; kept as-is */
    if (p->wchmm->category_tree) {
      if (p->config->pass1.old_tree_function_flag) {
	if (build_wchmm(p->wchmm, p->lm->config) == FALSE) {
	  jlog("ERROR: m_fusion: error in bulding wchmm\n");
	  return FALSE;
	}
      } else {
	if (build_wchmm2(p->wchmm, p->lm->config) == FALSE) {
	  jlog("ERROR: m_fusion: error in bulding wchmm\n");
	  return FALSE;
	}
      }
    } else {
      if (build_wchmm2(p->wchmm, p->lm->config) == FALSE) {
	jlog("ERROR: m_fusion: error in bulding wchmm\n");
	return FALSE;
      }
    }

    /* enter interactive check mode when "-check" was given at startup */
    if (p->config->sw.wchmm_check_flag) {
      wchmm_check_interactive(p->wchmm);
    }

    /* set beam width */
    /* guess beam width from models, when not specified */
    p->trellis_beam_width = set_beam_width(p->wchmm, p->config->pass1.specified_trellis_beam_width);

    /* initialize cache for factoring */
    max_successor_cache_init(p->wchmm);
  }

  /* backtrellis initialization */
  p->backtrellis = (BACKTRELLIS *)mymalloc(sizeof(BACKTRELLIS));
  bt_init(p->backtrellis);

  /* prepare work area for 2nd pass */
  wchmm_fbs_prepare(p);

  jlog("STAT: SR%02d %s composed\n", sconf->id, sconf->name);

  if (sconf->sw.start_inactive) {
    /* start inactive */
    p->active = -1;
  } else {
    /* book activation for the recognition */
    p->active = 1;
  }
  if (p->lmtype == LM_DFA) {
    if (p->lm->winfo == NULL ||
	(p->lmvar == LM_DFA_GRAMMAR && p->lm->dfa == NULL)) {
      /* no dictionary (or no grammar for DFA): make this instance inactive */
      p->active = -1;
    }
  }

  return TRUE;
}