Example #1
/** 
 * <JA>
 * @brief  発話区間終了の検知
 * 
 * ショートポーズセグメンテーション指定時,
 * 発話区間の終了を検出する. 無音単語が連続して最尤候補となるフレーム数を
 * カウントし,一定時間持続後にふたたび音声がトリガした時点で入力を
 * 区切る. 
 *
 * SPSEGMENT_NAIST 定義時は,よりセグメント前後・間の無音時間が長い場合を
 * 想定したデコーダベースの VAD に切り替わる. この場合,音声トリガ検出前
 * (r->pass1.after_trigger == FALSE)では,仮説を生成しない状態で認識処理を
 * 続ける. 音声開始を検出したら特徴量を一定長 (r->config->successive.sp_margin)
 * 分だけ巻き戻して,通常の認識を開始する(r->pass1.after_trigger == TRUE). 
 * 通常の認識中に無音区間が長く (r->config->successive.sp_frame_duration 以上)
 * 続いたら,そこで入力を区切る. 
 * 
 * @param r [i/o] 音声認識処理インスタンス
 * @param time [in] 現在の入力フレーム
 * 
 * @return TRUE (このフレームでの終了を検出したら), FALSE (終了でない場合)
 * </JA>
 * <EN>
 * @brief  Speech end point detection.
 * 
 * Detect the end of input from the duration of short-pause words when
 * short-pause segmentation is enabled.  When a pause word keeps getting the
 * maximum score over successive frames, those frames are treated as a pause
 * segment.  When speech triggers again after that, the current input will be
 * segmented at that point.
 *
 * When SPSEGMENT_NAIST is defined, this function performs an extended
 * version of short-pause segmentation called "decoder-based VAD".  Before
 * the speech trigger (r->pass1.after_trigger == FALSE), it tells the
 * recognition functions not to generate a word trellis and to continue
 * calculation.  When a speech trigger is found (a non-pause word gets the
 * maximum score), the input is rewound by a certain number of frames
 * (r->config->successive.sp_margin) and normal recognition starts from the
 * rewound frame (r->pass1.after_trigger = TRUE).  When the pause duration
 * then reaches a limit (r->config->successive.sp_frame_duration), the search
 * is terminated and the input is segmented there.
 * 
 * @param r [i/o] recognition process instance
 * @param time [in] current input frame
 * 
 * @return TRUE if end-of-input detected at this frame, FALSE if not.
 * </EN>
 * @callgraph
 * @callergraph
 */
boolean
detect_end_of_segment(RecogProcess *r, int time)
{
  FSBeam *d;
  TRELLIS_ATOM *tre;
  LOGPROB maxscore = LOG_ZERO;
  TRELLIS_ATOM *tremax = NULL;
  int count = 0;
  boolean detected = FALSE;
#ifdef SPSEGMENT_NAIST
  MFCCCalc *mfcc;
  WORD_ID wid;
  int j;
  TOKEN2 *tk;
  int startframe;
#endif

  d = &(r->pass1);

#ifdef SPSEGMENT_NAIST

  if (! d->after_trigger) {
    /* we are in the first long pause segment before trigger */

    /* find word end of maximum score from beam status */
    for (j = d->n_start; j <= d->n_end; j++) {
      tk = &(d->tlist[d->tn][d->tindex[d->tn][j]]);
      if (r->wchmm->stend[tk->node] != WORD_INVALID) {
        if (maxscore < tk->score) {
          maxscore = tk->score;
          wid = r->wchmm->stend[tk->node];
        }
      }
    }
    if (maxscore == LOG_ZERO) detected = TRUE;
    else if (is_sil(wid, r)) detected = TRUE;
 
    if (detected) {
      /***********************/
      /* this is noise frame */
      /***********************/

      /* reset trigger duration */
      d->trigger_duration = 0;
      
      /* if the noise continues for more than a certain number of frames,
         shrink the noise area to avoid unbounded memory usage */
      if (r->am->mfcc->f > SPSEGMENT_NAIST_AUTOSHRINK_LIMIT) {
        d->want_rewind = TRUE;
        d->rewind_frame = r->am->mfcc->f - r->config->successive.sp_margin;
        d->want_rewind_reprocess = FALSE;
        if (debug2_flag) {
          jlog("DEBUG: pause exceeded %d, rewind\n", SPSEGMENT_NAIST_AUTOSHRINK_LIMIT);
        }
        return FALSE;
      }

      /* keep going */
      d->want_rewind = FALSE;

    } else {
      /************************/
      /* this is speech frame */
      /************************/

      /* increment trigger duration */
      d->trigger_duration++;
      
      /* if the duration is not long enough yet, do not treat it as an up trigger */
      if (d->trigger_duration < r->config->successive.sp_delay) {
        /* just continue detection */
        return FALSE;
      }

      /***************************/
      /* found speech up-trigger */
      /***************************/
      /* set backstep point */
      if (r->am->mfcc->f < r->config->successive.sp_margin) {
        startframe = 0;
      } else {
        startframe = r->am->mfcc->f - r->config->successive.sp_margin;
      }
      if (debug2_flag) {
        jlog("DEBUG: speech triggered\n");
        jlog("DEBUG: word=[%s] dur=%d\n", r->lm->winfo->woutput[wid], d->trigger_duration);
        jlog("DEBUG: backstep behind %d (from %d to %d) frame and start process\n", r->config->successive.sp_margin, r->am->mfcc->f, startframe);
      }

      /* if the pause segment was short, keep the context of last segment.
         else, reset the context */
      if (r->lmtype == LM_PROB) {
        if (startframe > 0) {
          r->sp_break_last_word = WORD_INVALID;
        }
      }

      /* reset sp duration */
      d->sp_duration = 0;

      /* request the caller to rewind the search to the backstep point and
         re-start with normal search */
      d->want_rewind = TRUE;
      d->rewind_frame = startframe;
      d->want_rewind_reprocess = TRUE;
      /* normal search will start at the next processing step */
      d->after_trigger = TRUE;
    }
    /* tell the caller not to segment */
    return FALSE;
  }

#endif /* SPSEGMENT_NAIST */

  /* look for the best trellis word on the given time frame */
  for(tre = r->backtrellis->list; tre != NULL && tre->endtime == time; tre = tre->next) {
    if (maxscore < tre->backscore) {
      maxscore = tre->backscore;
      tremax = tre;
    }
    count++;
  }
  if (tremax == NULL) { /* no word end: possible at the very beginning of input */
    detected = TRUE;            /* assume it is within the short-pause duration */
  } else if (count > 0) {       /* word ends found --- check if the best one is a pause word */
    if (is_sil(tremax->wid, r)) {
      detected = TRUE;
    }
  }


#ifdef SPSEGMENT_NAIST
  /************************************************************************/
  /************************************************************************/

  /* detected = TRUE if noise frame, or FALSE if speech frame */

  /* sp区間持続チェック */
  /* check sp segment duration */
  if (d->first_sparea) {
    /* we are in the first sp segment */
    if (d->in_sparea && detected) {
      /* sp continues */
      d->sp_duration++;
      /* when the pause continues for more than sp_delay + sp_margin +
	 sp_frame_duration frames, it means that although a speech trigger
	 was detected for some reason, no actual speech has been found at
	 the beginning. */
      /* in this case we force a trigger to end this input */
      if (d->sp_duration > r->config->successive.sp_delay + r->config->successive.sp_margin + r->config->successive.sp_frame_duration) {
	d->in_sparea = FALSE;
	d->first_sparea = FALSE;
	if (debug2_flag) {
	  jlog("DEBUG: no valid speech starts, force trigger at %d\n", r->am->mfcc->f);
	}
      }
    } else if (d->in_sparea && !detected) {
      /* found speech frame */
      d->in_sparea = FALSE;
      d->first_sparea = FALSE;
      if (debug2_flag) {
        jlog("DEBUG: speech segment start at %d\n", r->am->mfcc->f);
      }
    }
  } else {
    /* we are either in speech segment, or trailing sp segment */
    if (!d->in_sparea) {
      /* we are in speech segment */
      if (detected) {
        /* detected end of speech segment (begin of sp segment) */
        /* 一時的に開始フレームとしてマーク */
        /* mark this frame as the temporary beginning of a short-pause segment */
        d->tmp_sparea_start = time;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
        if (r->lmtype == LM_PROB) {
          /* sp 区間開始時点の最尤単語を保存 */
          /* store the best word in this frame as resuming word */
          d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
        }
#endif
        d->in_sparea = TRUE;
        d->sp_duration = 1;
      } else {
        /* speech continues */
        /* keep recognizing */
      }
    } else {
      /* we are in trailing sp segment */
      if (detected) {
        /* short pause frame continues */
        d->sp_duration++;
        /* keep the word as the "beginning" word of the next sp segment */
        if (r->lmtype == LM_PROB) {
#ifdef SP_BREAK_RESUME_WORD_BEGIN
          /* if this segment was triggered by (tremax == NULL) (as can happen
             in the first several frames of input), the sp word (to be used as
             the resuming word in the next segment) is not set yet; it will be
             detected here */
          if (d->tmp_sp_break_last_word == WORD_INVALID) {
            if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
          }
#else
          /* resume word at the "end" of sp segment */
          /* simply update the best sp word */
          if (tremax != NULL) d->last_tre_word = tremax->wid;
#endif
        }

        if (d->sp_duration >= r->config->successive.sp_frame_duration) {
          /* silence over, segment the recognition here */
          /* store the beginning frame of the segment */
          //d->sparea_start = d->tmp_sparea_start;
          r->am->mfcc->sparea_start = time - r->config->successive.sp_frame_duration;
          if (r->lmtype == LM_PROB) {
#ifdef SP_BREAK_RESUME_WORD_BEGIN
            /* resume word = most likely sp word on beginning frame of the segment */
            r->sp_break_last_word = d->tmp_sp_break_last_word;
#else
            /* resume word = most likely sp word on end frame of the segment */
            r->sp_break_last_word = d->last_tre_word;
#endif
          }

          if (debug2_flag) {
            jlog("DEBUG: trailing silence end, end this segment at %d\n", r->am->mfcc->f);
          }
          
          d->after_trigger = FALSE;
          d->trigger_duration = 0;
          d->want_rewind = FALSE;

          /*** segment: [sparea_start - time-1] ***/
          return(TRUE);
        }
        /* else, keep recognition */
      } else {
        /* speech re-triggered */
        /* keep recognition */
        d->in_sparea = FALSE;
      }
    }
  }

  d->want_rewind = FALSE;


#else  /* ~SPSEGMENT_NAIST */
  /************************************************************************/
  /************************************************************************/

  /* sp区間持続チェック */
  /* check sp segment duration */
  if (d->in_sparea && detected) {       /* we are already in sp segment and sp continues */
    d->sp_duration++;           /* increment count */
#ifdef SP_BREAK_RESUME_WORD_BEGIN
    /* resume word at the "beginning" of the sp segment */
    /* if this segment was triggered by (tremax == NULL) (as can happen in
       the first several frames of input), the sp word (to be used as the
       resuming word in the next segment) is not set yet; it will be
       detected here */
    if (d->tmp_sp_break_last_word == WORD_INVALID) {
      if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
    }
#else
    /* resume word at the "end" of sp segment */
    /* simply update the best sp word */
    if (tremax != NULL) d->last_tre_word = tremax->wid;
#endif
  }

  /* sp区間開始チェック */
  /* check if sp segment begins at this frame */
  else if (!d->in_sparea && detected) {
    /* 一時的に開始フレームとしてマーク */
    /* mark this frame as the temporary beginning of a short-pause segment */
    d->tmp_sparea_start = time;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
    /* sp 区間開始時点の最尤単語を保存 */
    /* store the best word in this frame as resuming word */
    d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
#endif
    d->in_sparea = TRUE;                /* yes, we are in sp segment */
    d->sp_duration = 1;         /* initialize duration count */
#ifdef SP_BREAK_DEBUG
    jlog("DEBUG: sp start %d\n", time);
#endif /* SP_BREAK_DEBUG */
  }
  
  /* sp 区間終了チェック */
  /* check if sp segment ends at this frame */
  else if (d->in_sparea && !detected) {
    /* (time-1) is end frame of pause segment */
    d->in_sparea = FALSE;               /* we are not in sp segment */
#ifdef SP_BREAK_DEBUG
    jlog("DEBUG: sp end %d\n", time);
#endif /* SP_BREAK_DEBUG */
    /* sp 区間長チェック */
    /* check the length of the pause duration */
    if (d->sp_duration < r->config->successive.sp_frame_duration) {
      /* 短すぎる: 第1パスを中断せず続行 */
      /* too short segment: not break, continue 1st pass */
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: too short (%d<%d), ignored\n", d->sp_duration, r->config->successive.sp_frame_duration);
#endif /* SP_BREAK_DEBUG */
    } else if (d->first_sparea) {
      /* 最初のsp区間は silB にあたるので,第1パスを中断せず続行 */
      /* do not break at the first sp segment: it corresponds to silB */
      d->first_sparea = FALSE;
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: first silence, ignored\n");
#endif /* SP_BREAK_DEBUG */
    } else {
      /* 区間終了確定, 第1パスを中断して第2パスへ */
      /* break 1st pass */
#ifdef SP_BREAK_DEBUG
      jlog("DEBUG: >> segment [%d..%d]\n", r->am->mfcc->sparea_start, time-1);
#endif /* SP_BREAK_DEBUG */
      /* store the beginning frame of the segment */
      r->am->mfcc->sparea_start = d->tmp_sparea_start;
#ifdef SP_BREAK_RESUME_WORD_BEGIN
      /* resume word = most likely sp word on beginning frame of the segment */
      r->sp_break_last_word = d->tmp_sp_break_last_word;
#else
      /* resume word = most likely sp word on end frame of the segment */
      r->sp_break_last_word = d->last_tre_word;
#endif

      /*** segment: [sparea_start - time-1] ***/
      return(TRUE);
    }
  }


#endif  /* ~SPSEGMENT_NAIST */

    
#ifdef SP_BREAK_EVAL
  jlog("DEBUG: [%d %d %d]\n", time, count, (detected) ? 50 : 0);
#endif
  return (FALSE);
}
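
For reference, below is a minimal sketch (not taken from Julius itself) of how a first-pass driver could consume the results of detect_end_of_segment(). It uses only fields that appear in the function above (want_rewind, rewind_frame, want_rewind_reprocess, sparea_start); the loop structure and the name drive_first_pass are assumptions for illustration.

/* Hypothetical driver sketch -- not the actual Julius first-pass loop. */
#include <julius/julius.h>

static void
drive_first_pass(RecogProcess *r, int num_frames)
{
  int t;

  for (t = 0; t < num_frames; t++) {
    /* ... frame-synchronous beam search for frame t would run here ... */

    if (detect_end_of_segment(r, t)) {
      /* segment boundary confirmed: stop the first pass here and run the
         second pass on [r->am->mfcc->sparea_start .. t-1] */
      break;
    }
    if (r->pass1.want_rewind) {
      /* decoder-based VAD requested a backstep: either shrink a long noise
         region (want_rewind_reprocess == FALSE) or restart normal
         recognition from the backstep point (want_rewind_reprocess == TRUE);
         the caller must re-feed feature frames from rewind_frame */
      t = r->pass1.rewind_frame - 1;  /* loop increment brings t to rewind_frame */
    }
  }
}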
Example #2
File: m_fusion.c  Project: xawirq/julius
/** 
 * <EN>
 * @brief  Launch a recognition process instance.
 *
 * This function creates a recognition process instance from the given
 * SEARCH configuration and launches a recognizer for that search.  The
 * created instance is then installed into the engine instance.  The sconf
 * should be registered to the global jconf before calling this function.
 *
 * </EN>
 *
 * <JA>
 * @brief 認識処理インスタンスを立ち上げる.
 *
 * この関数は,与えられた SEARCH 設定に従って 認識処理インスタンスを生成し,
 * 対応する音声認識器を構築します.その後,その生成された認識処理インスタンスは
 * 新たにエンジンインスタンスに登録されます.SEARCH設定はこの関数を
 * 呼ぶ前にあらかじめ全体設定jconfに登録されている必要があります.
 * 
 * </JA>
 * 
 * @param recog [i/o] engine instance
 * @param sconf [in] SEARCH configuration to launch
 * 
 * @return TRUE on success, or FALSE on error.
 *
 * @callgraph
 * @callergraph
 * @ingroup instance
 * 
 */
boolean
j_launch_recognition_instance(Recog *recog, JCONF_SEARCH *sconf)
{
  RecogProcess *p;
  PROCESS_AM *am;
  PROCESS_LM *lm;

  jlog("STAT: composing recognizer instance SR%02d %s (AM%02d %s, LM%02d %s)\n", sconf->id, sconf->name, sconf->amconf->id, sconf->amconf->name, sconf->lmconf->id, sconf->lmconf->name);

  /* allocate recognition instance */
  p = j_recogprocess_new(recog, sconf);

  /* assign corresponding AM instance and LM instance to use */
  for(lm=recog->lmlist;lm;lm=lm->next) {
    if (sconf->lmconf == lm->config) {
      for(am=recog->amlist;am;am=am->next) {
	if (sconf->amconf == am->config) {
	  p->am = am;
	  p->lm = lm;
	}
      }
    }
  }

  if (p->config->sw.triphone_check_flag && p->am->hmminfo->is_triphone) {
    /* go into interactive triphone HMM check mode */
    hmm_check(p);
  }
  
  /******************************************/
  /******** set work area and flags *********/
  /******************************************/

  /* copy values from sub-instances for handy access during recognition */
  /* set lm type */
  p->lmtype = p->lm->lmtype;
  p->lmvar  = p->lm->lmvar;
  p->graphout = p->config->graph.enabled;
  
  /* set flag for context dependent handling */
  if (p->config->force_ccd_handling) {
    p->ccd_flag = p->config->ccd_handling;
  } else {
    if (p->am->hmminfo->is_triphone) {
      p->ccd_flag = TRUE;
    } else {
      p->ccd_flag = FALSE;
    }
  }

  /* iwsp prepare */
  if (p->lm->config->enable_iwsp) {
    if (p->am->hmminfo->multipath) {
      /* find short-pause model */
      if (p->am->hmminfo->sp == NULL) {
	jlog("ERROR: iwsp enabled but no short pause model \"%s\" in hmmdefs\n", p->am->config->spmodel_name);
	return FALSE;
      }
      p->am->hmminfo->iwsp_penalty = p->am->config->iwsp_penalty;
    } else {
      jlog("ERROR: \"-iwsp\" needs multi-path mode\n");
      jlog("ERROR: you should use multi-path AM, or specify \"-multipath\" with \"-iwsp\"\n");
      return FALSE;
    }
  }

  /* for short-pause segmentation  */
  if (p->config->successive.enabled) {
    if (p->config->successive.pausemodelname) {
      /* a pause model name string was specified: split it and store the names in p */
      char *s;
      int n;
      p->pass1.pausemodelnames = (char*)mymalloc(strlen(p->config->successive.pausemodelname)+1);
      strcpy(p->pass1.pausemodelnames, p->config->successive.pausemodelname);
      n = 0;
      for (s = strtok(p->pass1.pausemodelnames, " ,"); s; s = strtok(NULL, " ,")) {
	n++;
      }
      p->pass1.pausemodelnum = n;
      p->pass1.pausemodel = (char **)mymalloc(sizeof(char *) * n);
      strcpy(p->pass1.pausemodelnames, p->config->successive.pausemodelname);
      n = 0;
      for (s = strtok(p->pass1.pausemodelnames, " ,"); s; s = strtok(NULL, " ,")) {
	p->pass1.pausemodel[n++] = s;
      }
    } else {
      p->pass1.pausemodel = NULL;
    }
    /* check if a pause word exists in the dictionary */
    {
      WORD_ID w;
      boolean ok_p;
      ok_p = FALSE;
      for(w=0;w<p->lm->winfo->num;w++) {
	if (is_sil(w, p)) {
	  ok_p = TRUE;
	  break;
	}
      }
      if (!ok_p) {
#ifdef SPSEGMENT_NAIST
	jlog("Error: no pause word in dictionary needed for decoder-based VAD\n");
#else
	jlog("Error: no pause word in dictionary needed for short-pause segmentation\n");
#endif
	jlog("Error: you should have at least one pause word in dictionary\n");
	jlog("Error: you can specify pause model names by \"-pausemodels\"\n");
	return FALSE;
      }
    }
  }

  /**********************************************/
  /******** set model-specific defaults *********/
  /**********************************************/
  if (p->lmtype == LM_PROB) {
    /* set default lm parameter if not specified */
    if (!p->config->lmp.lmp_specified) {
      if (p->am->hmminfo->is_triphone) {
	p->config->lmp.lm_weight = DEFAULT_LM_WEIGHT_TRI_PASS1;
	p->config->lmp.lm_penalty = DEFAULT_LM_PENALTY_TRI_PASS1;
      } else {
	p->config->lmp.lm_weight = DEFAULT_LM_WEIGHT_MONO_PASS1;
	p->config->lmp.lm_penalty = DEFAULT_LM_PENALTY_MONO_PASS1;
      }
    }
    if (!p->config->lmp.lmp2_specified) {
      if (p->am->hmminfo->is_triphone) {
	p->config->lmp.lm_weight2 = DEFAULT_LM_WEIGHT_TRI_PASS2;
	p->config->lmp.lm_penalty2 = DEFAULT_LM_PENALTY_TRI_PASS2;
      } else {
	p->config->lmp.lm_weight2 = DEFAULT_LM_WEIGHT_MONO_PASS2;
	p->config->lmp.lm_penalty2 = DEFAULT_LM_PENALTY_MONO_PASS2;
      }
    }
    if (p->config->lmp.lmp_specified != p->config->lmp.lmp2_specified) {
      jlog("WARNING: m_fusion: only -lmp or -lmp2 specified, LM weights may be unbalanced\n");
    }
  }

  /****************************/
  /******* build wchmm ********/
  /****************************/
  if (p->lmtype == LM_DFA) {
    /* generate the global grammar and build the wchmm */
    multigram_build(p); /* returns TRUE if some modification occurred */
  }

  if (p->lmtype == LM_PROB) {
    /* build wchmm with N-gram */
    p->wchmm = wchmm_new();
    p->wchmm->lmtype = p->lmtype;
    p->wchmm->lmvar  = p->lmvar;
    p->wchmm->ccd_flag = p->ccd_flag;
    p->wchmm->category_tree = FALSE;
    p->wchmm->hmmwrk = &(p->am->hmmwrk);
    /* assign models */
    p->wchmm->ngram = p->lm->ngram;
    if (p->lmvar == LM_NGRAM_USER) {
      /* register LM functions for 1st pass here */
      p->wchmm->uni_prob_user = p->lm->lmfunc.uniprob;
      p->wchmm->bi_prob_user = p->lm->lmfunc.biprob;
    }
    p->wchmm->winfo = p->lm->winfo;
    p->wchmm->hmminfo = p->am->hmminfo;
    if (p->wchmm->category_tree) {
      if (p->config->pass1.old_tree_function_flag) {
	if (build_wchmm(p->wchmm, p->lm->config) == FALSE) {
	  jlog("ERROR: m_fusion: error in bulding wchmm\n");
	  return FALSE;
	}
      } else {
	if (build_wchmm2(p->wchmm, p->lm->config) == FALSE) {
	  jlog("ERROR: m_fusion: error in bulding wchmm\n");
	  return FALSE;
	}
      }
    } else {
      if (build_wchmm2(p->wchmm, p->lm->config) == FALSE) {
	jlog("ERROR: m_fusion: error in bulding wchmm\n");
	return FALSE;
      }
    }

    /* 起動時 -check でチェックモードへ */
    /* enter check mode if -check was specified at startup */
    if (p->config->sw.wchmm_check_flag) {
      wchmm_check_interactive(p->wchmm);
    }

    /* set beam width */
    /* guess beam width from models, when not specified */
    p->trellis_beam_width = set_beam_width(p->wchmm, p->config->pass1.specified_trellis_beam_width);

    /* initialize cache for factoring */
    max_successor_cache_init(p->wchmm);
  }

  /* backtrellis initialization */
  p->backtrellis = (BACKTRELLIS *)mymalloc(sizeof(BACKTRELLIS));
  bt_init(p->backtrellis);

  /* prepare work area for 2nd pass */
  wchmm_fbs_prepare(p);

  jlog("STAT: SR%02d %s composed\n", sconf->id, sconf->name);

  if (sconf->sw.start_inactive) {
    /* start inactive */
    p->active = -1;
  } else {
    /* book activation for the recognition */
    p->active = 1;
  }
  if (p->lmtype == LM_DFA) {
    if (p->lm->winfo == NULL ||
	(p->lmvar == LM_DFA_GRAMMAR && p->lm->dfa == NULL)) {
      /* make this instance inactive */
      p->active = -1;
    }
  }

  return TRUE;
}
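
As the comment notes, the SEARCH configuration must already be registered to the global jconf before this function is called. Below is a rough sketch of that calling order; register_search_to_jconf() is a hypothetical placeholder (not a Julius API function) standing in for whatever code appends the JCONF_SEARCH to recog->jconf.

/* Hypothetical calling-order sketch; register_search_to_jconf() is a
   placeholder, not part of the Julius API. */
#include <julius/julius.h>

extern boolean register_search_to_jconf(Jconf *jconf, JCONF_SEARCH *sconf);

boolean
add_and_launch_search(Recog *recog, JCONF_SEARCH *sconf)
{
  /* 1. register the SEARCH configuration to the global jconf first,
        as required by j_launch_recognition_instance() */
  if (register_search_to_jconf(recog->jconf, sconf) == FALSE) return FALSE;

  /* 2. create the recognition process instance, bind its AM/LM, build the
        tree lexicon (wchmm), and activate it */
  if (j_launch_recognition_instance(recog, sconf) == FALSE) return FALSE;

  return TRUE;
}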