Example 1
/** 
 * Extract MFCC parameters with sentence CMN from the given waveform.
 * 
 * @param speech [in] buffer of speech waveform data
 * @param speechlen [in] length of @a speech in samples
 * 
 * @return pointer to a newly allocated parameter structure holding the
 * extracted MFCC vector sequence.
 */
HTK_Param *new_wav2mfcc(SP16 speech[], int speechlen)
{
  HTK_Param *param;
  int framenum;
  int i;
  int len;

  if (ssload_filename && ssbuf == NULL) {
    /* load noise spectrum for spectral subtraction from file (once) */
    if ((ssbuf = new_SS_load_from_file(ssload_filename, &sslen)) == NULL) {
      j_error("Error: failed to read \"%s\"\n", ssload_filename);
    }
  }

  if (sscalc) {
    /* compute noise spectrum from head silence for each input */
    len = sscalc_len * para.smp_freq / 1000;
    if (len > speechlen) len = speechlen;
#ifdef SSDEBUG
    printf("[%d]\n", len);
#endif
    ssbuf = new_SS_calculate(speech, len, para, &sslen);
  }
#ifdef SSDEBUG
  {
    int i;
    for(i=0;i<sslen;i++) {
      printf("%d: %f\n", i, ssbuf[i]);
    }
  }
#endif
  
  /* calculate frame length from speech length, frame size and frame shift */
  framenum = (int)((speechlen - para.framesize) / para.frameshift) + 1;
  if (framenum < 1) {
    j_printerr("input too short (%d samples), ignored\n", speechlen);
    return NULL;
  }
  
  /* malloc new param */
  param = new_param();
  param->parvec = (VECT **)mymalloc(sizeof(VECT *) * framenum);
  for(i=0;i<framenum;i++) {
    param->parvec[i] = (VECT *)mymalloc(sizeof(VECT) * para.veclen);
  }

  /* make MFCC from speech data */
  Wav2MFCC(speech, param->parvec, para, speechlen, ssbuf, sslen);

  /* set miscellaneous parameters */
  param->header.samplenum = framenum;
  param->header.wshift = para.smp_period * para.frameshift;
  param->header.sampsize = para.veclen * sizeof(VECT); /* not compressed */
  param->header.samptype = F_MFCC;
  if (para.delta) param->header.samptype |= F_DELTA;
  if (para.acc) param->header.samptype |= F_ACCL;
  if (para.energy) param->header.samptype |= F_ENERGY;
  if (para.c0) param->header.samptype |= F_ZEROTH;
  if (para.absesup) param->header.samptype |= F_ENERGY_SUP;
  if (para.cmn) param->header.samptype |= F_CEPNORM;
  param->veclen = para.veclen;
  param->samplenum = framenum;

  return param;
}
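A minimal usage sketch follows. Only new_wav2mfcc() itself comes from the example above; the sketch assumes that free_param() is the matching deallocator for structures created by new_param(), and that speech/speechlen already hold a recorded waveform.

  /* Usage sketch (assumptions: free_param() releases an HTK_Param,
     and speech/speechlen hold a recorded waveform). */
  HTK_Param *p;

  p = new_wav2mfcc(speech, speechlen);
  if (p == NULL) {
    j_printerr("feature extraction failed (input too short?)\n");
  } else {
    /* p->parvec[t][d] holds coefficient d of frame t,
       for t = 0 .. p->samplenum - 1 and d = 0 .. p->veclen - 1 */
    free_param(p);   /* assumed deallocator for the returned structure */
  }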
Example 2
int
main(int argc, char *argv[])
{
  Recog *recog;
  Jconf *jconf;
  float *ss;
  MFCCWork *wrk;

  /* create instance */
  recog = j_recog_new();
  jconf = j_jconf_new();
  recog->jconf = jconf;

  /* set application-specific additional options */
  j_add_option("-freq", 1, 1, "sampling freq in Hz", opt_freq);
  j_add_option("-len", 1, 1, "record length in msec", opt_len);
  j_add_option("-h", 0, 0, "display this help", opt_help);
  j_add_option("-help", 0, 0, "display this help", opt_help);
  j_add_option("--help", 0, 0, "display this help", opt_help);

  /* when no argument, output help and exit */
  if (argc <= 1) {
    opt_help(jconf, NULL, 0);
    return 0;
  }

  /* regard last arg as filename */
  if (strmatch(argv[argc-1], "-")) {
    stout = TRUE;
  } else {
    filename = argv[argc-1];
  }

  /* set default as same as "-input mic" */
  jconf->input.type = INPUT_WAVEFORM;
  jconf->input.speech_input = SP_MIC;
  jconf->input.device = SP_INPUT_DEFAULT;
  /* process config and load them */
  if (j_config_load_args(jconf, argc-1, argv) == -1) {
    fprintf(stderr, "Error reading arguments\n");
    return -1;
  }
  /* force some default values */
  jconf->detect.silence_cut  = 0; /* disable silence cut */
  jconf->preprocess.strip_zero_sample = TRUE; /* strip zero samples */
  jconf->detect.level_thres = 0;	/* no VAD, record all */
  /* set Julius default parameters for unspecified acoustic parameters */
  apply_para(&(jconf->am_root->analysis.para), &(jconf->am_root->analysis.para_default));
  /* set some values */
  jconf->input.sfreq = jconf->am_root->analysis.para.smp_freq;
  jconf->input.period = jconf->am_root->analysis.para.smp_period;
  jconf->input.frameshift = jconf->am_root->analysis.para.frameshift;
  jconf->input.framesize = jconf->am_root->analysis.para.framesize;

  sfreq = jconf->am_root->analysis.para.smp_freq;

  /* output file check */
  if (!stout) {
    if (access(filename, F_OK) == 0) {
      if (access(filename, W_OK) == 0) {
	fprintf(stderr, "Warning: overwriting file \"%s\"\n", filename);
      } else {
	perror("mkss");
	return(1);
      }
    }
  }

  /* allocate speech store buffer */
  samples = sfreq * slen / 1000;
  speech = (SP16 *)mymalloc(sizeof(SP16) * samples);

  /* allocate work area to compute spectrum */
  wrk = WMP_work_new(&(jconf->am_root->analysis.para));
  if (wrk == NULL) {
    jlog("ERROR: m_fusion: failed to initialize MFCC computation for SS\n");
    return -1;
  }

  /* initialize input device */
  if (j_adin_init(recog) == FALSE) {
    fprintf(stderr, "Error in initializing adin device\n");
    return -1;
  }

  /* open device */
  if (j_open_stream(recog, NULL) < 0) {
    fprintf(stderr, "Error in opening adin device\n");
    return -1;
  }

  /* record mic input */
  fprintf(stderr, "%dHz recording for %.2f seconds of noise\n", sfreq, (float)slen /(float)1000);
  speechnum = 0;
  adin_go(adin_callback, NULL, recog);

  /* close device */
  adin_end(recog->adin);
  fprintf(stderr, "\n%d samples (%d bytes, %.1f sec) recorded\n", samples, samples * sizeof(SP16), (float)samples / (float)sfreq);

  /* compute SS */
  fprintf(stderr, "compute SS:\n");
  fprintf(stderr, "  fsize : %4d samples (%.1f msec)\n", jconf->input.framesize, (float)jconf->input.framesize * 1000.0/ (float)sfreq);
  fprintf(stderr, "  fshift: %4d samples (%.1f msec)\n", jconf->input.frameshift, (float)jconf->input.frameshift * 1000.0/ (float)sfreq);

  ss = new_SS_calculate(speech, samples, &sslen, wrk, &(jconf->am_root->analysis.para));

  fprintf(stderr, "  points: %4d\n", sslen);
  fprintf(stderr, "noise spectrum was measured\n");
  
  /* open file for recording */
  fprintf(stderr, "writing average noise spectrum to [%s]...", filename);
  if (stout) {
    fd = 1;
  } else {
    if ((fd = open(filename, O_CREAT | O_RDWR
#ifdef O_BINARY
		   | O_BINARY
#endif
		   , 0644)) == -1) {
      perror("mkss");
      return(1);
    }
  }
  x = sslen;
#ifndef WORDS_BIGENDIAN
  swap_bytes((char *)&x, sizeof(int), 1);
#endif
  if (write(fd, &x, sizeof(int)) < sizeof(int)) {
    perror("mkss");
    return(1);
  }
#ifndef WORDS_BIGENDIAN
  swap_bytes((char *)ss, sizeof(float), sslen);
#endif
  if (write(fd, ss, sslen * sizeof(float)) < (ssize_t)(sslen * sizeof(float))) {
    perror("mkss");
    return(1);
  }
  if (!stout) {
    if (close(fd) < 0) {
      perror("mkss");
      return(1);
    }
  }
  fprintf(stderr, "done\n");

  WMP_free(wrk);

  return 0;
}
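The file written above is simply a 4-byte big-endian integer giving the number of spectrum points, followed by that many big-endian floats; Julius itself reads it back with new_SS_load_from_file(), as seen in Examples 1 and 3. Below is a hedged sketch of a matching standalone reader, reusing the library's swap_bytes() helper as called in the code above; the function name read_noise_spectrum is hypothetical.

  #include <stdio.h>
  #include <stdlib.h>

  /* Read a noise spectrum file written by mkss: a big-endian int count
     followed by that many big-endian floats (layout assumed from the
     write code above). Returns a malloc'ed array, or NULL on error. */
  static float *read_noise_spectrum(const char *fname, int *lenp)
  {
    FILE *fp;
    int n;
    float *buf;

    if ((fp = fopen(fname, "rb")) == NULL) return NULL;
    if (fread(&n, sizeof(int), 1, fp) != 1) { fclose(fp); return NULL; }
  #ifndef WORDS_BIGENDIAN
    swap_bytes((char *)&n, sizeof(int), 1);   /* file stores big-endian */
  #endif
    if (n <= 0) { fclose(fp); return NULL; }
    buf = (float *)malloc(sizeof(float) * n);
    if (buf == NULL || fread(buf, sizeof(float), n, fp) != (size_t)n) {
      free(buf); fclose(fp); return NULL;
    }
  #ifndef WORDS_BIGENDIAN
    swap_bytes((char *)buf, sizeof(float), n);
  #endif
    fclose(fp);
    *lenp = n;
    return buf;
  }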
Example 3
/** 
 * Extract MFCC parameters with sentence CMN from the given waveform.
 * Parameters are computed for each MFCC calculation instance in the
 * engine instance and stored in each instance's mfcc->param.
 * 
 * @param speech [in] buffer of speech waveform data
 * @param speechlen [in] length of @a speech in samples
 * @param recog [in] engine instance
 * 
 * @return TRUE on success, FALSE on error.
 *
 * @callgraph
 * @callergraph
 */
boolean
wav2mfcc(SP16 speech[], int speechlen, Recog *recog)
{
  int framenum;
  int len;
  Value *para;
  MFCCCalc *mfcc;
  int veclen;
  int t, i;

  /* calculate frame length from speech length, frame size and frame shift */
  framenum = (int)((speechlen - recog->jconf->input.framesize) / recog->jconf->input.frameshift) + 1;
  if (framenum < 1) {
    jlog("WARNING: input too short (%d samples), ignored\n", speechlen);
    return FALSE;
  }

  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {

    if (mfcc->frontend.ssload_filename) {
      /* setup for spectral subtraction using file */
      if (mfcc->frontend.ssbuf == NULL) {
	/* load noise spectrum for spectral subtraction from file (once) */
	if ((mfcc->frontend.ssbuf = new_SS_load_from_file(mfcc->frontend.ssload_filename, &(mfcc->frontend.sslen))) == NULL) {
	  jlog("ERROR: wav2mfcc: failed to read noise spectrum from file \"%s\"\n", mfcc->frontend.ssload_filename);
	  return FALSE;
	}
      }
    }

    if (mfcc->frontend.sscalc) {
      /* compute noise spectrum from head silence for each input */
      len = mfcc->frontend.sscalc_len * recog->jconf->input.sfreq / 1000;
      if (len > speechlen) len = speechlen;
#ifdef SSDEBUG
      jlog("DEBUG: [%d]\n", len);
#endif
      mfcc->frontend.ssbuf = new_SS_calculate(speech, len, &(mfcc->frontend.sslen), mfcc->frontend.mfccwrk_ss, mfcc->para);
    }

  }

  /* compute mfcc from speech file for each mfcc instances */
  for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) {

    para = mfcc->para;
    veclen = para->veclen * mfcc->splice;

    if (framenum - (mfcc->splice - 1) < 1) {
      jlog("WARNING: input too short (%d samples), ignored\n", speechlen);
      return FALSE;
    }

    /* malloc new param */
    param_init_content(mfcc->param);
    if (param_alloc(mfcc->param, framenum, veclen) == FALSE) {
      jlog("ERROR: failed to allocate memory for converted parameter vectors\n");
      return FALSE;
    }

    if (mfcc->frontend.ssload_filename || mfcc->frontend.sscalc) {
      /* make link from mfccs to this buffer */
      mfcc->wrk->ssbuf = mfcc->frontend.ssbuf;
      mfcc->wrk->ssbuflen = mfcc->frontend.sslen;
      mfcc->wrk->ss_alpha = mfcc->frontend.ss_alpha;
      mfcc->wrk->ss_floor = mfcc->frontend.ss_floor;
    }
  
    /* make MFCC from speech data */
    if (Wav2MFCC(speech, mfcc->param->parvec, para, speechlen, mfcc->wrk, mfcc->cmn.wrk) == FALSE) {
      jlog("ERROR: failed to compute features from input speech\n");
      if (mfcc->frontend.sscalc) {
	free(mfcc->frontend.ssbuf);
	mfcc->frontend.ssbuf = NULL;
      }
      return FALSE;
    }

    /* splicing */
    if (mfcc->splice > 1) {
      for (t = 0; t < framenum - (mfcc->splice - 1); t++) {
	for (i = 1; i < mfcc->splice; i++) {
	  memcpy(&(mfcc->param->parvec[t][para->veclen * i]), &(mfcc->param->parvec[t + i][0]), sizeof(VECT) * para->veclen);
	}
      }
    }

    /* set miscellaneous parameters */
    mfcc->param->header.samplenum = framenum - (mfcc->splice - 1);
    mfcc->param->header.wshift = para->smp_period * para->frameshift;
    mfcc->param->header.sampsize = veclen * sizeof(VECT); /* not compressed */
    mfcc->param->header.samptype = para->basetype;
    if (para->delta) mfcc->param->header.samptype |= F_DELTA;
    if (para->acc) mfcc->param->header.samptype |= F_ACCL;
    if (para->energy) mfcc->param->header.samptype |= F_ENERGY;
    if (para->c0) mfcc->param->header.samptype |= F_ZEROTH;
    if (para->absesup) mfcc->param->header.samptype |= F_ENERGY_SUP;
    if (para->cmn) mfcc->param->header.samptype |= F_CEPNORM;
    mfcc->param->veclen = veclen;
    mfcc->param->samplenum = framenum - (mfcc->splice - 1);

    if (mfcc->frontend.sscalc) {
      free(mfcc->frontend.ssbuf);
      mfcc->frontend.ssbuf = NULL;
    }
  }

  return TRUE;
}
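To make the frame-count arithmetic above concrete, here is a small hypothetical helper (not part of Julius) that mirrors it: framenum = (speechlen - framesize) / frameshift + 1, with splicing then discarding the last (splice - 1) frames.

  /* Hypothetical helper illustrating the frame-count arithmetic used
     above. Example: 16000 samples (1 s at 16 kHz) with framesize = 400
     (25 ms) and frameshift = 160 (10 ms) give (16000-400)/160 + 1 = 98
     frames; with splice = 3, 98 - 2 = 96 spliced frames remain. */
  static int spliced_frame_count(int speechlen, int framesize,
                                 int frameshift, int splice)
  {
    int framenum = (speechlen - framesize) / frameshift + 1;
    if (framenum < 1) return 0;        /* input too short */
    framenum -= (splice - 1);          /* frames lost to splicing */
    return (framenum < 1) ? 0 : framenum;
  }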