/* Example no. 1 */
/* 0 */
/** 
 * <JA>
 * 音声波形データから MFCC パラメータを抽出する.
 * 
 * @param speech [in] 音声波形データ
 * @param speechlen [in] @a speech の長さ(単位:サンプル数)
 * 
 * @return 新たに割り付けられ抽出パラメータベクトルが格納されている
 * パラメータ構造体へのポインタを返す.
 * </JA>
 * <EN>
 * Extract MFCC parameters with sentence CMN from given waveform.
 * 
 * @param speech [in] buffer of speech waveform
 * @param speechlen [in] length of @a speech in samples
 * 
 * @return pointer to newly allocated parameter structure data with extracted
 * MFCC vector sequence, or NULL if the input is too short.
 * </EN>
 */
HTK_Param *new_wav2mfcc(SP16 speech[], int speechlen)
{
  /*
   * Builds a new HTK_Param holding one feature vector per analysis frame of
   * `speech`.  Analysis settings and the spectral-subtraction state (para,
   * ssbuf, sslen, ssload_filename, sscalc, sscalc_len) are taken from
   * variables declared outside this function.
   *
   * Returns NULL, after printing a message, when the frame shift is not
   * positive or when the input is shorter than one analysis frame.
   */
  HTK_Param *param;
  int framenum;
  int i;
  int len;

  if (ssload_filename && ssbuf == NULL) {
    /* load noise spectrum for spectral subtraction from file (once) */
    if ((ssbuf = new_SS_load_from_file(ssload_filename, &sslen)) == NULL) {
      j_error("Error: failed to read \"%s\"\n", ssload_filename);
    }
  }

  if (sscalc) {
    /* compute noise spectrum from head silence for each input:
       sscalc_len is scaled by smp_freq / 1000 to get a sample count,
       which is then clipped to the available input */
    len = sscalc_len * para.smp_freq / 1000;
    if (len > speechlen) len = speechlen;
#ifdef SSDEBUG
    printf("[%d]\n", len);
#endif
    /* NOTE(review): the previous value of ssbuf is overwritten here without
       being released; if new_SS_calculate() returns heap memory this leaks
       one buffer per call -- confirm ownership before adding a free(). */
    ssbuf = new_SS_calculate(speech, len, para, &sslen);
  }
#ifdef SSDEBUG
  {
    /* dump the noise spectrum in use (own index, does not shadow `i`) */
    int k;
    for(k=0;k<sslen;k++) {
      printf("%d: %f\n", k, ssbuf[k]);
    }
  }
#endif

  /* A non-positive frame shift would make the division below undefined
     (zero) or meaningless (negative), so refuse it explicitly. */
  if (para.frameshift <= 0) {
    j_printerr("Error: invalid frame shift (%d), cannot extract parameters\n", (int)para.frameshift);
    return NULL;
  }

  /* Reject input shorter than one frame BEFORE dividing.  C integer division
     truncates toward zero, so for framesize - frameshift < speechlen <
     framesize the expression (speechlen - framesize) / frameshift + 1 would
     evaluate to 1 and a too-short input would slip through. */
  if (speechlen < para.framesize) {
    j_printerr("input too short (%d samples), ignored\n", speechlen);
    return NULL;
  }

  /* calculate frame length from speech length, frame size and frame shift;
     with the guards above the result is always >= 1 */
  framenum = (int)((speechlen - para.framesize) / para.frameshift) + 1;

  /* malloc new param: one vector of para.veclen elements per frame */
  param = new_param();
  param->parvec = (VECT **)mymalloc(sizeof(VECT *) * framenum);
  for(i=0;i<framenum;i++) {
    param->parvec[i] = (VECT *)mymalloc(sizeof(VECT) * para.veclen);
  }

  /* make MFCC from speech data */
  Wav2MFCC(speech, param->parvec, para, speechlen, ssbuf, sslen);

  /* set miscellaneous parameters */
  param->header.samplenum = framenum;
  param->header.wshift = para.smp_period * para.frameshift;
  param->header.sampsize = para.veclen * sizeof(VECT); /* not compressed */
  /* sample type: base MFCC plus one qualifier flag per enabled option */
  param->header.samptype = F_MFCC;
  if (para.delta) param->header.samptype |= F_DELTA;
  if (para.acc) param->header.samptype |= F_ACCL;
  if (para.energy) param->header.samptype |= F_ENERGY;
  if (para.c0) param->header.samptype |= F_ZEROTH;
  if (para.absesup) param->header.samptype |= F_ENERGY_SUP;
  if (para.cmn) param->header.samptype |= F_CEPNORM;
  param->veclen = para.veclen;
  param->samplenum = framenum;

  return param;
}
/* Example no. 2 */
/* 0 */
/** 
 * <JA>
 * 音声波形データから MFCC パラメータを抽出する.
 * エンジンインスタンス内の MFCC 計算インスタンスごとにパラメータ抽出が
 * 行われ,それぞれの mfcc->param に格納される. 
 * 
 * @param speech [in] 音声波形データ
 * @param speechlen [in] @a speech の長さ(単位:サンプル数)
 * @param recog [in] エンジンインスタンス
 * 
 * @return 成功時 TRUE, エラー時 FALSE を返す. 
 * </JA>
 * <EN>
 * Extract MFCC parameters with sentence CMN from given waveform.
 * Parameters will be computed for each MFCC calculation instance
 * in the engine instance, and stored in mfcc->param for each.
 * 
 * @param speech [in] buffer of speech waveform
 * @param speechlen [in] length of @a speech in samples
 * @param recog [in] engine instance
 * 
 * @return TRUE on success, FALSE on error.
 * </EN>
 *
 * @callgraph
 * @callergraph
 */
boolean
wav2mfcc(SP16 speech[], int speechlen, Recog *recog)
{
  /*
   * For every MFCC calculation instance in recog->mfcclist, fills
   * mfcc->param with the feature vectors computed from `speech`.
   *
   * Noise spectra computed from the head of the input (frontend.sscalc) are
   * valid for this call only: they are released, and the link to them in
   * mfcc->wrk is cleared, before returning on success and on every error
   * path.  Spectra loaded from a file are kept in frontend.ssbuf.
   */
  int framenum;
  int len;
  Value *para;
  MFCCCalc *mfcc;
  int veclen;
  int t, i;

  /* A non-positive frame shift would make the division below undefined
     (zero) or meaningless (negative), so refuse it explicitly. */
  if (recog->jconf->input.frameshift <= 0) {
    jlog("ERROR: wav2mfcc: invalid frame shift (%d)\n", (int)recog->jconf->input.frameshift);
    return FALSE;
  }

  /* Reject input shorter than one frame BEFORE dividing.  C integer division
     truncates toward zero, so for framesize - frameshift < speechlen <
     framesize the frame-count expression would evaluate to 1 and a
     too-short input would slip through. */
  if (speechlen < recog->jconf->input.framesize) {
    jlog("WARNING: input too short (%d samples), ignored\n", speechlen);
    return FALSE;
  }

  /* calculate frame length from speech length, frame size and frame shift;
     with the guards above the result is always >= 1 */
  framenum = (int)((speechlen - recog->jconf->input.framesize) / recog->jconf->input.frameshift) + 1;

  /* Load file-based noise spectra first (once per instance).  Doing every
     load before any per-utterance spectrum is computed means a load failure
     can return directly with nothing to release. */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {
    if (mfcc->frontend.ssload_filename && mfcc->frontend.ssbuf == NULL) {
      if ((mfcc->frontend.ssbuf = new_SS_load_from_file(mfcc->frontend.ssload_filename, &(mfcc->frontend.sslen))) == NULL) {
        jlog("ERROR: wav2mfcc: failed to read noise spectrum from file \"%s\"\n", mfcc->frontend.ssload_filename);
        return FALSE;
      }
    }
  }

  /* compute noise spectrum from head silence for each input */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {
    if (mfcc->frontend.sscalc) {
      /* sscalc_len is scaled by sfreq / 1000 to get a sample count,
         which is then clipped to the available input */
      len = mfcc->frontend.sscalc_len * recog->jconf->input.sfreq / 1000;
      if (len > speechlen) len = speechlen;
#ifdef SSDEBUG
      jlog("DEBUG: [%d]\n", len);
#endif
      /* NOTE(review): if ssload_filename is set on the same instance, the
         spectrum loaded above is overwritten here without being released --
         confirm whether that combination is possible. */
      mfcc->frontend.ssbuf = new_SS_calculate(speech, len, &(mfcc->frontend.sslen), mfcc->frontend.mfccwrk_ss, mfcc->para);
    }
  }

  /* compute mfcc from speech file for each mfcc instances */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {

    para = mfcc->para;
    /* each output vector holds `splice` consecutive base vectors */
    veclen = para->veclen * mfcc->splice;

    /* splicing consumes (splice - 1) trailing frames */
    if (framenum - (mfcc->splice - 1) < 1) {
      jlog("WARNING: input too short (%d samples), ignored\n", speechlen);
      goto fail;
    }

    /* malloc new param */
    param_init_content(mfcc->param);
    if (param_alloc(mfcc->param, framenum, veclen) == FALSE) {
      jlog("ERROR: failed to allocate memory for converted parameter vectors\n");
      goto fail;
    }

    if (mfcc->frontend.ssload_filename || mfcc->frontend.sscalc) {
      /* make link from mfccs to this buffer */
      mfcc->wrk->ssbuf = mfcc->frontend.ssbuf;
      mfcc->wrk->ssbuflen = mfcc->frontend.sslen;
      mfcc->wrk->ss_alpha = mfcc->frontend.ss_alpha;
      mfcc->wrk->ss_floor = mfcc->frontend.ss_floor;
    }

    /* make MFCC from speech data */
    if (Wav2MFCC(speech, mfcc->param->parvec, para, speechlen, mfcc->wrk, mfcc->cmn.wrk) == FALSE) {
      jlog("ERROR: failed to compute features from input speech\n");
      goto fail;
    }

    /* splicing: append the base vectors of the following (splice - 1)
       frames to frame t.  Only the first para->veclen elements of the later
       rows are read, and those are never written by this loop. */
    if (mfcc->splice > 1) {
      for (t = 0; t < framenum - (mfcc->splice - 1); t++) {
        for (i = 1; i < mfcc->splice; i++) {
          memcpy(&(mfcc->param->parvec[t][para->veclen * i]), &(mfcc->param->parvec[t + i][0]), sizeof(VECT) * para->veclen);
        }
      }
    }

    /* set miscellaneous parameters */
    mfcc->param->header.samplenum = framenum - (mfcc->splice - 1);
    mfcc->param->header.wshift = para->smp_period * para->frameshift;
    mfcc->param->header.sampsize = veclen * sizeof(VECT); /* not compressed */
    /* sample type: base type plus one qualifier flag per enabled option */
    mfcc->param->header.samptype = para->basetype;
    if (para->delta) mfcc->param->header.samptype |= F_DELTA;
    if (para->acc) mfcc->param->header.samptype |= F_ACCL;
    if (para->energy) mfcc->param->header.samptype |= F_ENERGY;
    if (para->c0) mfcc->param->header.samptype |= F_ZEROTH;
    if (para->absesup) mfcc->param->header.samptype |= F_ENERGY_SUP;
    if (para->cmn) mfcc->param->header.samptype |= F_CEPNORM;
    mfcc->param->veclen = veclen;
    mfcc->param->samplenum = framenum - (mfcc->splice - 1);

    if (mfcc->frontend.sscalc) {
      /* the per-utterance spectrum is no longer needed: release it and drop
         the work-area link so no pointer to freed memory is left behind */
      free(mfcc->frontend.ssbuf);
      mfcc->frontend.ssbuf = NULL;
      mfcc->wrk->ssbuf = NULL;
      mfcc->wrk->ssbuflen = 0;
    }
  }

  return TRUE;

 fail:
  /* Release the per-utterance noise spectrum of EVERY instance, not only the
     one that failed: all of them were computed up front.  Instances already
     finished hold NULL here, for which free() is a no-op. */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {
    if (mfcc->frontend.sscalc) {
      free(mfcc->frontend.ssbuf);
      mfcc->frontend.ssbuf = NULL;
      if (mfcc->wrk != NULL) {
        mfcc->wrk->ssbuf = NULL;
        mfcc->wrk->ssbuflen = 0;
      }
    }
  }
  return FALSE;
}