/**
 * Compute "cepwin" features: the input frames in the window around the
 * current frame, concatenated into the single output stream.
 *
 * @param fcb  Feature control block; must describe exactly one stream.
 * @param mfc  Pointer to the current frame pointer.  mfc[-win] .. mfc[win]
 *             (win = feat_window_size(fcb)) must each point to a frame of
 *             feat_cepsize(fcb) values.
 * @param feat Output streams; feat[0] receives (2 * win + 1) *
 *             feat_cepsize(fcb) values.
 */
static void
feat_s3_cepwin(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    int32 win, cepsize, i;

    assert(fcb);
    assert(feat_n_stream(fcb) == 1);

    win = feat_window_size(fcb);
    cepsize = feat_cepsize(fcb);

    /*
     * CEP window.  Copy one frame at a time through the frame pointers
     * rather than with a single memcpy() starting at mfc[-win]: the
     * callers assemble arrays of frame pointers (padding frames, circular
     * buffer wraparound) whose targets are not guaranteed to be adjacent
     * in memory.  When the frames do happen to be contiguous the result
     * is identical to one large copy.
     */
    for (i = -win; i <= win; ++i)
        memcpy(feat[0] + (i + win) * cepsize, mfc[i],
               cepsize * sizeof(mfcc_t));
}
/*
 * Single-stream cepstra + delta-cepstra features.
 *
 * feat[0] is filled with the current frame followed by the element-wise
 * difference between the frame two ahead and the frame two behind.
 */
static void
feat_s3_cep_dcep(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    mfcc_t *delta;
    const mfcc_t *ahead;
    const mfcc_t *behind;
    int32 ncep;
    int32 k;

    assert(fcb);
    assert(feat_n_stream(fcb) == 1);
    assert(feat_stream_len(fcb, 0) == feat_cepsize(fcb) * 2);
    assert(feat_window_size(fcb) == 2);

    ncep = feat_cepsize(fcb);

    /* First half of the output: the cepstra themselves. */
    memcpy(feat[0], mfc[0], ncep * sizeof(mfcc_t));

    /* Second half: mfc[2] - mfc[-2]. */
    delta = feat[0] + ncep;
    ahead = mfc[2];
    behind = mfc[-2];
    for (k = 0; k < ncep; ++k)
        delta[k] = ahead[k] - behind[k];
}
/*
 * Compute features for a complete utterance in one go.
 *
 * A temporary array of frame pointers is built: `win` padding frames, the
 * caller's nfr frames, then `win` more padding frames.  CMN and AGC run on
 * the caller's frames first; the padding frames (storage borrowed from
 * fcb->cepbuf) are then filled with copies of the first and last frame.
 * Returns nfr.
 */
static int32
feat_s2mfc2feat_block_utt(feat_t * fcb, mfcc_t ** uttcep, int32 nfr, mfcc_t *** ofeat)
{
    const int32 pad = feat_window_size(fcb);
    const size_t frame_bytes = feat_cepsize(fcb) * sizeof(mfcc_t);
    const int32 padded_nfr = nfr + pad * 2;
    mfcc_t **frames;
    int32 k;

    /* Only pointers are copied here; the feature functions reach every
     * frame through these pointers, so the frames need not be adjacent. */
    frames = (mfcc_t **) ckd_calloc(padded_nfr, sizeof(mfcc_t *));
    memcpy(frames + pad, uttcep, nfr * sizeof(mfcc_t *));

    /* Normalize the real frames before they are replicated at the edges. */
    feat_cmn(fcb, frames + pad, nfr, 1, 1);
    feat_agc(fcb, frames + pad, nfr, 1, 1);

    /* Replicate the first and last frame into the padding slots. */
    for (k = 0; k < pad; ++k) {
        mfcc_t *lead = fcb->cepbuf[k];
        mfcc_t *trail = fcb->cepbuf[pad + k];

        memcpy(lead, uttcep[0], frame_bytes);
        memcpy(trail, uttcep[nfr - 1], frame_bytes);
        frames[k] = lead;
        frames[nfr + pad + k] = trail;
    }

    feat_compute_utt(fcb, frames, padded_nfr, pad, ofeat);
    ckd_free(frames);

    return nfr;
}
/*
 * Single-stream features made of four equally sized sections written back
 * to back into feat[0]: cepstra, short-span deltas, long-span deltas and
 * second-order deltas.
 */
static void
feat_1s_c_d_ld_dd_cep2feat(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    mfcc_t *out;
    const mfcc_t *fwd, *bwd;
    const mfcc_t *fwd_hi, *bwd_hi, *fwd_lo, *bwd_lo;
    int32 ncep;
    int32 k;

    assert(fcb);
    assert(feat_n_stream(fcb) == 1);
    assert(feat_stream_len(fcb, 0) == feat_cepsize(fcb) * 4);
    assert(feat_window_size(fcb) == FEAT_DCEP_WIN * 2);

    ncep = feat_cepsize(fcb);
    out = feat[0];

    /* Section 1 -- CEP: the current frame, verbatim. */
    memcpy(out, mfc[0], ncep * sizeof(mfcc_t));
    out += ncep;

    /* Section 2 -- DCEP: mfc[w] - mfc[-w], w = FEAT_DCEP_WIN. */
    fwd = mfc[FEAT_DCEP_WIN];
    bwd = mfc[-FEAT_DCEP_WIN];
    for (k = 0; k < ncep; ++k)
        out[k] = fwd[k] - bwd[k];
    out += ncep;

    /* Section 3 -- LDCEP: same difference at twice the span. */
    fwd = mfc[FEAT_DCEP_WIN * 2];
    bwd = mfc[-FEAT_DCEP_WIN * 2];
    for (k = 0; k < ncep; ++k)
        out[k] = fwd[k] - bwd[k];
    out += ncep;

    /* Section 4 -- D2CEP:
     * (mfc[w+1] - mfc[-w+1]) - (mfc[w-1] - mfc[-w-1]), w = FEAT_DCEP_WIN. */
    fwd_hi = mfc[FEAT_DCEP_WIN + 1];
    bwd_hi = mfc[-FEAT_DCEP_WIN + 1];
    fwd_lo = mfc[FEAT_DCEP_WIN - 1];
    bwd_lo = mfc[-FEAT_DCEP_WIN - 1];
    for (k = 0; k < ncep; ++k) {
        mfcc_t d_hi = fwd_hi[k] - bwd_hi[k];
        mfcc_t d_lo = fwd_lo[k] - bwd_lo[k];

        out[k] = d_hi - d_lo;
    }
}
/*
 * Single 39-element stream laid out as
 *   [ C1..C12 | delta C1..C12 | C0, delta C0, delta-delta C0 | delta-delta C1..C12 ].
 * C0 is element 0 of each input frame; the "+ 1" offsets below skip it.
 */
static void
feat_s3_1x39_cep2feat(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    mfcc_t *cep_out, *dcep_out, *pow_out, *d2cep_out;
    const mfcc_t *p2, *m2;
    const mfcc_t *p3, *m1, *p1, *m3;
    mfcc_t outer, inner;
    int32 nbase;
    int32 k;

    assert(fcb);
    assert(feat_cepsize(fcb) == 13);
    assert(feat_n_stream(fcb) == 1);
    assert(feat_stream_len(fcb, 0) == 39);
    assert(feat_window_size(fcb) == 3);

    /* Number of coefficients per frame once C0 is set aside. */
    nbase = feat_cepsize(fcb) - 1;

    /* Carve the output vector into its four sections. */
    cep_out = feat[0];
    dcep_out = cep_out + nbase;
    pow_out = dcep_out + nbase;
    d2cep_out = pow_out + 3;

    /* CEP without C0. */
    memcpy(cep_out, mfc[0] + 1, nbase * sizeof(mfcc_t));

    /* DCEP: mfc[2] - mfc[-2]. */
    p2 = mfc[2] + 1;
    m2 = mfc[-2] + 1;
    for (k = 0; k < nbase; ++k)
        dcep_out[k] = p2[k] - m2[k];

    /* POW: C0, its delta and its second-order delta. */
    pow_out[0] = mfc[0][0];
    pow_out[1] = mfc[2][0] - mfc[-2][0];
    outer = mfc[3][0] - mfc[-1][0];
    inner = mfc[1][0] - mfc[-3][0];
    pow_out[2] = outer - inner;

    /* D2CEP: (mfc[3] - mfc[-1]) - (mfc[1] - mfc[-3]). */
    p3 = mfc[3] + 1;
    m1 = mfc[-1] + 1;
    p1 = mfc[1] + 1;
    m3 = mfc[-3] + 1;
    for (k = 0; k < nbase; ++k) {
        outer = p3[k] - m1[k];
        inner = p1[k] - m3[k];
        d2cep_out[k] = outer - inner;
    }
}
/*
 * Pass input frames straight through as features, concatenating the frames
 * of the window for every stream.
 *
 * The stored stream length already includes the window multiplier, so it is
 * divided by (2 * win + 1) to recover the per-frame width of each stream.
 * Stream j reads its slice of every input frame starting at the summed
 * per-frame widths of streams 0 .. j-1.
 */
static void
feat_copy(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    const int32 half = feat_window_size(fcb);
    uint32 src_off = 0;
    int32 strm;
    int32 frm;

    for (strm = 0; strm < feat_n_stream(fcb); ++strm) {
        /* Width of this stream within one input frame. */
        const uint32 width = feat_stream_len(fcb, strm) / (2 * half + 1);

        /* Frame -half lands in slot 0, frame +half in the last slot. */
        for (frm = -half; frm <= half; ++frm)
            memcpy(feat[strm] + ((frm + half) * width),
                   mfc[frm] + src_off,
                   width * sizeof(mfcc_t));

        src_off += width;
    }
}
/**
 * Allocate and initialize a feature control block for a feature type.
 *
 * Recognized type names (tested in this order, several by prefix):
 * "s2_4x", "s3_1x39" / "1s_12c_12d_3p_12dd", "1s_c_d_dd...",
 * "1s_c_d_ld_dd...", "cep_dcep..." / "1s_c_d...", "cep..." / "1s_c...",
 * "1s_3c..." / "1s_4c...".  Anything else is parsed as a generic
 * definition: a comma-separated list of stream widths, optionally followed
 * by ":<window size>".
 *
 * @param type     Feature type name or generic definition string.
 * @param cmn      Cepstral mean normalization type; a CMN object is created
 *                 unless this is CMN_NONE.
 * @param varnorm  Stored in the control block as the variance normalization
 *                 flag.
 * @param agc      Gain control type; an AGC object is created unless this
 *                 is AGC_NONE.
 * @param breport  If nonzero, log the chosen configuration.
 * @param cepsize  Number of values per input frame; 0 selects 13.
 * @return The new control block, or 0 if the type requires cepsize == 13
 *         and a different size was given.  Malformed generic definitions
 *         are reported through E_FATAL.
 */
feat_t *
feat_init(char const *type, cmn_type_t cmn, int32 varnorm, agc_type_t agc, int32 breport, int32 cepsize)
{
    feat_t *fcb;

    if (cepsize == 0)
        cepsize = 13;
    if (breport)
        E_INFO
            ("Initializing feature stream to type: '%s', ceplen=%d, CMN='%s', VARNORM='%s', AGC='%s'\n",
             type, cepsize, cmn_type_str[cmn], varnorm ? "yes" : "no", agc_type_str[agc]);

    fcb = (feat_t *) ckd_calloc(1, sizeof(feat_t));
    fcb->refcount = 1;
    fcb->name = (char *) ckd_salloc(type);
    if (strcmp(type, "s2_4x") == 0) {
        /* Sphinx-II format 4-stream feature (Hack!! hardwired constants below) */
        if (cepsize != 13) {
            E_ERROR("s2_4x features require cepsize == 13\n");
            /* Release the name as well as the block itself. */
            ckd_free(fcb->name);
            ckd_free(fcb);
            return 0;
        }
        fcb->cepsize = 13;
        fcb->n_stream = 4;
        fcb->stream_len = (uint32 *) ckd_calloc(4, sizeof(uint32));
        fcb->stream_len[0] = 12;
        fcb->stream_len[1] = 24;
        fcb->stream_len[2] = 3;
        fcb->stream_len[3] = 12;
        fcb->out_dim = 51;
        fcb->window_size = 4;
        fcb->compute_feat = feat_s2_4x_cep2feat;
    }
    else if ((strcmp(type, "s3_1x39") == 0) || (strcmp(type, "1s_12c_12d_3p_12dd") == 0)) {
        /* 1-stream cep/dcep/pow/ddcep (Hack!! hardwired constants below) */
        if (cepsize != 13) {
            /* Report the type actually requested, not "s2_4x". */
            E_ERROR("%s features require cepsize == 13\n", type);
            ckd_free(fcb->name);
            ckd_free(fcb);
            return 0;
        }
        fcb->cepsize = 13;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = 39;
        fcb->out_dim = 39;
        fcb->window_size = 3;
        fcb->compute_feat = feat_s3_1x39_cep2feat;
    }
    else if (strncmp(type, "1s_c_d_dd", 9) == 0) {
        fcb->cepsize = cepsize;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = cepsize * 3;
        fcb->out_dim = cepsize * 3;
        fcb->window_size = FEAT_DCEP_WIN + 1; /* ddcep needs the extra 1 */
        fcb->compute_feat = feat_1s_c_d_dd_cep2feat;
    }
    else if (strncmp(type, "1s_c_d_ld_dd", 12) == 0) {
        fcb->cepsize = cepsize;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = cepsize * 4;
        fcb->out_dim = cepsize * 4;
        fcb->window_size = FEAT_DCEP_WIN * 2;
        fcb->compute_feat = feat_1s_c_d_ld_dd_cep2feat;
    }
    else if (strncmp(type, "cep_dcep", 8) == 0 || strncmp(type, "1s_c_d", 6) == 0) {
        /* 1-stream cep/dcep.  Must come after the longer "1s_c_d_*"
         * prefixes tested above. */
        fcb->cepsize = cepsize;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = feat_cepsize(fcb) * 2;
        fcb->out_dim = fcb->stream_len[0];
        fcb->window_size = 2;
        fcb->compute_feat = feat_s3_cep_dcep;
    }
    else if (strncmp(type, "cep", 3) == 0 || strncmp(type, "1s_c", 4) == 0) {
        /* 1-stream cep */
        fcb->cepsize = cepsize;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = feat_cepsize(fcb);
        fcb->out_dim = fcb->stream_len[0];
        fcb->window_size = 0;
        fcb->compute_feat = feat_s3_cep;
    }
    else if (strncmp(type, "1s_3c", 5) == 0 || strncmp(type, "1s_4c", 5) == 0) {
        /* 1-stream cep with frames concatenated, so called cepwin features */
        if (strncmp(type, "1s_3c", 5) == 0)
            fcb->window_size = 3;
        else
            fcb->window_size = 4;

        fcb->cepsize = cepsize;
        fcb->n_stream = 1;
        fcb->stream_len = (uint32 *) ckd_calloc(1, sizeof(uint32));
        fcb->stream_len[0] = feat_cepsize(fcb) * (2 * fcb->window_size + 1);
        fcb->out_dim = fcb->stream_len[0];
        fcb->compute_feat = feat_copy;
    }
    else {
        int32 i, l, k;
        char *strp;
        char *mtype = ckd_salloc(type);
        char *wd = ckd_salloc(type);
        /*
         * Generic definition: Format should be %d,%d,%d,...,%d (i.e.,
         * comma separated list of feature stream widths; #items =
         * #streams).  An optional window size (frames will be
         * concatenated) is also allowed, which can be specified with
         * a colon after the list of feature streams.
         */
        l = strlen(mtype);
        k = 0;
        for (i = 1; i < l - 1; i++) {
            if (mtype[i] == ',') {
                /* Turn separators into blanks so the widths can be
                 * scanned as whitespace-delimited words below. */
                mtype[i] = ' ';
                k++;
            }
            else if (mtype[i] == ':') {
                mtype[i] = '\0';
                fcb->window_size = atoi(mtype + i + 1);
                break;
            }
        }
        k++;                    /* Presumably there are (#commas+1) streams */
        fcb->n_stream = k;
        fcb->stream_len = (uint32 *) ckd_calloc(k, sizeof(uint32));

        /* Scan individual feature stream lengths */
        strp = mtype;
        i = 0;
        fcb->out_dim = 0;
        fcb->cepsize = 0;
#ifndef POCKETSPHINX_NET
        while (sscanf(strp, "%s%n", wd, &l) == 1)
#else
        while (net_sscanf_word(strp, wd, &l) == 1)
#endif
        {
            strp += l;
            if ((i >= fcb->n_stream)
                ||
#ifndef POCKETSPHINX_NET
                (sscanf(wd, "%d", &(fcb->stream_len[i])) != 1)
#else
                UInt32::TryParse(gcnew String(wd), fcb->stream_len[i])
#endif
                || (fcb->stream_len[i] <= 0))
                E_FATAL("Bad feature type argument\n");
            /* Input size before windowing */
            fcb->cepsize += fcb->stream_len[i];
            if (fcb->window_size > 0)
                fcb->stream_len[i] *= (fcb->window_size * 2 + 1);
            /* Output size after windowing */
            fcb->out_dim += fcb->stream_len[i];
            i++;
        }
        if (i != fcb->n_stream)
            E_FATAL("Bad feature type argument\n");
        if (fcb->cepsize != cepsize)
            E_FATAL("Bad feature type argument\n");

        /* Input is already the feature stream */
        fcb->compute_feat = feat_copy;
        ckd_free(mtype);
        ckd_free(wd);
    }

    if (cmn != CMN_NONE)
        fcb->cmn_struct = cmn_init(feat_cepsize(fcb));
    fcb->cmn = cmn;
    fcb->varnorm = varnorm;
    if (agc != AGC_NONE) {
        fcb->agc_struct = agc_init();
        /*
         * No need to check if agc is set to EMAX; agc_emax_set() changes only emax related things
         * Moreover, if agc is not NONE and block mode is used, feat_agc() SILENTLY
         * switches to EMAX
         */
        /* HACK: hardwired initial estimates based on use of CMN (from Sphinx2) */
        agc_emax_set(fcb->agc_struct, (cmn != CMN_NONE) ? 5.0 : 10.0);
    }
    fcb->agc = agc;
    /*
     * Make sure this buffer is large enough to be used in feat_s2mfc2feat_block_utt()
     */
    fcb->cepbuf = (mfcc_t **) ckd_calloc_2d((LIVEBUFBLOCKSIZE < feat_window_size(fcb) * 2) ? feat_window_size(fcb) * 2 : LIVEBUFBLOCKSIZE,
                                            feat_cepsize(fcb),
                                            sizeof(mfcc_t));
    /* This one is actually just an array of pointers to "flatten out"
     * wraparounds. */
    fcb->tmpcepbuf = (mfcc_t**)ckd_calloc(2 * feat_window_size(fcb) + 1,
                                          sizeof(*fcb->tmpcepbuf));

    return fcb;
}
/*
 * Four-stream features:
 *   feat[0]  C1..C12 of the current frame
 *   feat[1]  short-span deltas (mfc[2] - mfc[-2]) followed by long-span
 *            deltas (mfc[4] - mfc[-4]) of C1..C12
 *   feat[2]  C0, delta C0, delta-delta C0
 *   feat[3]  delta-delta of C1..C12
 * C0 is element 0 of each input frame; the "+ 1" offsets below skip it.
 */
static void
feat_s2_4x_cep2feat(feat_t * fcb, mfcc_t ** mfc, mfcc_t ** feat)
{
    mfcc_t *dst;
    const mfcc_t *plus, *minus;
    const mfcc_t *p3, *m1, *p1, *m3;
    mfcc_t outer, inner;
    int32 nbase;
    int32 k;

    assert(fcb);
    assert(feat_cepsize(fcb) == 13);
    assert(feat_n_stream(fcb) == 4);
    assert(feat_stream_len(fcb, 0) == 12);
    assert(feat_stream_len(fcb, 1) == 24);
    assert(feat_stream_len(fcb, 2) == 3);
    assert(feat_stream_len(fcb, 3) == 12);
    assert(feat_window_size(fcb) == 4);

    /* Coefficients per frame once C0 is set aside. */
    nbase = feat_cepsize(fcb) - 1;

    /* Stream 0: CEP without C0. */
    memcpy(feat[0], mfc[0] + 1, nbase * sizeof(mfcc_t));

    /* Stream 1, first half: short-term DCEP. */
    dst = feat[1];
    plus = mfc[2] + 1;
    minus = mfc[-2] + 1;
    for (k = 0; k < nbase; ++k)
        dst[k] = plus[k] - minus[k];

    /* Stream 1, second half: long-term DCEP, appended right after. */
    dst += nbase;
    plus = mfc[4] + 1;
    minus = mfc[-4] + 1;
    for (k = 0; k < nbase; ++k)
        dst[k] = plus[k] - minus[k];

    /* Stream 3: D2CEP = (mfc[3] - mfc[-1]) - (mfc[1] - mfc[-3]). */
    dst = feat[3];
    p3 = mfc[3] + 1;
    m1 = mfc[-1] + 1;
    p1 = mfc[1] + 1;
    m3 = mfc[-3] + 1;
    for (k = 0; k < nbase; ++k) {
        outer = p3[k] - m1[k];
        inner = p1[k] - m3[k];
        dst[k] = outer - inner;
    }

    /* Stream 2: POW, the same differences applied to C0 alone. */
    dst = feat[2];
    dst[0] = mfc[0][0];
    dst[1] = mfc[2][0] - mfc[-2][0];
    outer = mfc[3][0] - mfc[-1][0];
    inner = mfc[1][0] - mfc[-3][0];
    dst[2] = outer - inner;
}
/**
 * Incrementally convert a block of input frames into feature vectors,
 * buffering frames in the circular buffer fcb->cepbuf between calls.
 *
 * @param fcb        Feature control block; its bufpos (write index) and
 *                   curpos (read index) into cepbuf are updated.
 * @param uttcep     Input frames.  They are normalized in place by
 *                   feat_cmn() / feat_agc().
 * @param inout_ncep In: number of frames available in uttcep.  Out: number
 *                   of frames actually consumed, which is reduced when the
 *                   circular buffer cannot hold them all.  May be a null
 *                   pointer, which is treated as zero frames.
 * @param beginutt   Nonzero at the start of an utterance: the buffer is
 *                   emptied and the first frame is replicated win times.
 * @param endutt     Nonzero at the end of an utterance: the last buffered
 *                   frame is replicated win times so the remaining output
 *                   can be flushed.  Cancelled when not all input fits.
 * @param ofeat      Output array; ofeat[i] receives the i-th feature vector
 *                   produced by this call.
 * @return Number of feature vectors written to ofeat (0 if none yet).
 */
int32 feat_s2mfc2feat_live(feat_t * fcb, mfcc_t ** uttcep, int32 *inout_ncep, int32 beginutt, int32 endutt, mfcc_t *** ofeat)
{
    int32 win, cepsize, nbufcep;
    int32 i, j, nfeatvec;
    int32 zero = 0;

    /* Avoid having to check this everywhere. */
    if (inout_ncep == 0) inout_ncep = &zero;

    /* Special case for entire utterances: hand off to the block routine,
     * which does not touch the circular buffer indices. */
    if (beginutt && endutt && *inout_ncep > 0)
        return feat_s2mfc2feat_block_utt(fcb, uttcep, *inout_ncep, ofeat);

    win = feat_window_size(fcb);
    cepsize = feat_cepsize(fcb);

    /* Empty the input buffer on start of utterance. */
    if (beginutt)
        fcb->bufpos = fcb->curpos;

    /* Calculate how much data is in the buffer already (the write index
     * may have wrapped around behind the read index). */
    nbufcep = fcb->bufpos - fcb->curpos;
    if (nbufcep < 0)
        nbufcep = fcb->bufpos + LIVEBUFBLOCKSIZE - fcb->curpos;
    /* Add any data that we have to replicate. */
    if (beginutt && *inout_ncep > 0)
        nbufcep += win;
    if (endutt)
        nbufcep += win;

    /* Only consume as much input as will fit in the buffer. */
    if (nbufcep + *inout_ncep > LIVEBUFBLOCKSIZE) {
        /* We also can't overwrite the trailing window, hence the
         * reason why win is subtracted here. */
        *inout_ncep = LIVEBUFBLOCKSIZE - nbufcep - win;
        /* Cancel end of utterance processing. */
        /* NOTE(review): nbufcep still includes the win frames added for
         * endutt above even though the trailing replication is now
         * skipped -- confirm the resulting frame count is intended. */
        endutt = FALSE;
    }

    /* FIXME: Don't modify the input! */
    feat_cmn(fcb, uttcep, *inout_ncep, beginutt, endutt);
    feat_agc(fcb, uttcep, *inout_ncep, beginutt, endutt);

    /* Replicate first frame into the first win frames if we're at the
     * beginning of the utterance and there was some actual input to
     * deal with.  (FIXME: Not entirely sure why that condition) */
    if (beginutt && *inout_ncep > 0) {
        for (i = 0; i < win; i++) {
            memcpy(fcb->cepbuf[fcb->bufpos++], uttcep[0],
                   cepsize * sizeof(mfcc_t));
            fcb->bufpos %= LIVEBUFBLOCKSIZE;
        }
        /* Move the current pointer past this data. */
        fcb->curpos = fcb->bufpos;
        /* The replicated frames only provide left context; they are not
         * counted as frames to produce output for. */
        nbufcep -= win;
    }

    /* Copy in frame data to the circular buffer. */
    for (i = 0; i < *inout_ncep; ++i) {
        memcpy(fcb->cepbuf[fcb->bufpos++], uttcep[i],
               cepsize * sizeof(mfcc_t));
        fcb->bufpos %= LIVEBUFBLOCKSIZE;
        ++nbufcep;
    }

    /* Replicate last frame into the last win frames if we're at the
     * end of the utterance (even if there was no input, so we can
     * flush the output). */
    if (endutt) {
        int32 tpos; /* Index of last input frame. */
        if (fcb->bufpos == 0)
            tpos = LIVEBUFBLOCKSIZE - 1;
        else
            tpos = fcb->bufpos - 1;
        for (i = 0; i < win; ++i) {
            memcpy(fcb->cepbuf[fcb->bufpos++], fcb->cepbuf[tpos],
                   cepsize * sizeof(mfcc_t));
            fcb->bufpos %= LIVEBUFBLOCKSIZE;
        }
    }

    /* We have to leave the trailing window of frames. */
    nfeatvec = nbufcep - win;
    if (nfeatvec <= 0)
        return 0; /* Do nothing. */

    for (i = 0; i < nfeatvec; ++i) {
        /* Handle wraparound cases: the window around curpos crosses the
         * physical start or end of cepbuf. */
        if (fcb->curpos - win < 0 || fcb->curpos + win >= LIVEBUFBLOCKSIZE) {
            /* Use tmpcepbuf for this case.  Actually, we just need the pointers. */
            for (j = -win; j <= win; ++j) {
                int32 tmppos =
                    (fcb->curpos + j + LIVEBUFBLOCKSIZE) % LIVEBUFBLOCKSIZE;
                fcb->tmpcepbuf[win + j] = fcb->cepbuf[tmppos];
            }
            fcb->compute_feat(fcb, fcb->tmpcepbuf + win, ofeat[i]);
        }
        else {
            fcb->compute_feat(fcb, fcb->cepbuf + fcb->curpos, ofeat[i]);
        }
        /* Move the read pointer forward. */
        ++fcb->curpos;
        fcb->curpos %= LIVEBUFBLOCKSIZE;
    }

    /* Optional post-processing of the vectors just produced. */
    if (fcb->lda)
        feat_lda_transform(fcb, ofeat, nfeatvec);

    if (fcb->subvecs)
        feat_subvec_project(fcb, ofeat, nfeatvec);

    return nfeatvec;
}
int32 feat_s2mfc2feat(feat_t * fcb, const char *file, const char *dir, const char *cepext, int32 sf, int32 ef, mfcc_t *** feat, int32 maxfr) { char *path; char *ps = "/"; int32 win, nfr; int32 file_length, cepext_length, path_length = 0; mfcc_t **mfc; if (fcb->cepsize <= 0) { E_ERROR("Bad cepsize: %d\n", fcb->cepsize); return -1; } if (cepext == 0) cepext = ""; /* * Create mfc filename, combining file, dir and extension if * necessary */ /* * First we decide about the path. If dir is defined, then use * it. Otherwise assume the filename already contains the path. */ if (dir == 0) { dir = ""; ps = ""; /* * This is not true but some 3rd party apps * may parse the output explicitly checking for this line */ E_INFO("At directory . (current directory)\n"); } else { E_INFO("At directory %s\n", dir); /* * Do not forget the path separator! */ path_length += strlen(dir) + 1; } /* * Include cepext, if it's not already part of the filename. */ file_length = strlen(file); cepext_length = strlen(cepext); if ((file_length > cepext_length) && (strcmp(file + file_length - cepext_length, cepext) == 0)) { cepext = ""; cepext_length = 0; } /* * Do not forget the '\0' */ path_length += file_length + cepext_length + 1; path = (char*) ckd_calloc(path_length, sizeof(char)); #ifdef HAVE_SNPRINTF /* * Paranoia is our best friend... */ while ((file_length = snprintf(path, path_length, "%s%s%s%s", dir, ps, file, cepext)) > path_length) { path_length = file_length; path = (char*) ckd_realloc(path, path_length * sizeof(char)); } #else #ifndef POCKETSPHINX_NET sprintf(path, "%s%s%s%s", dir, ps, file, cepext); #else strcpy(path,dir); strcat(path,ps); strcat(path,file); strcat(path,cepext); #endif #endif win = feat_window_size(fcb); /* Pad maxfr with win, so we read enough raw feature data to * calculate the requisite number of dynamic features. */ if (maxfr >= 0) maxfr += win * 2; if (feat != 0) { /* Read mfc file including window or padding if necessary. 
*/ nfr = feat_s2mfc_read_norm_pad(fcb, path, win, sf, ef, &mfc, maxfr, fcb->cepsize); ckd_free(path); if (nfr < 0) { ckd_free_2d((void **) mfc); return -1; } /* Actually compute the features */ feat_compute_utt(fcb, mfc, nfr, win, feat); ckd_free_2d((void **) mfc); } else { /* Just calculate the number of frames we would need. */ nfr = feat_s2mfc_read_norm_pad(fcb, path, win, sf, ef, 0, maxfr, fcb->cepsize); ckd_free(path); if (nfr < 0) return nfr; } return (nfr - win * 2); }
/**
 * Feed cepstral frames to the acoustic model, converting them to feature
 * vectors stored in acmod->feat_buf.
 *
 * @param acmod          Acoustic model state.  n_feat_frame, state and
 *                       (through acmod_grow_feat_buf()) the feature buffer
 *                       may be updated.
 * @param inout_cep      In: pointer to the array of input frames.  Out:
 *                       advanced past the frames that were consumed.
 * @param inout_n_frames In: number of frames available.  Out: number of
 *                       frames left unconsumed.
 * @param full_utt       If nonzero, the input is a whole utterance and is
 *                       handed to acmod_process_full_cep().
 * @return Number of input frames consumed, 0 in the end-of-utterance
 *         wraparound case below, or -1 if feature computation reported an
 *         error.
 */
int acmod_process_cep(acmod_t *acmod, mfcc_t ***inout_cep, int *inout_n_frames, int full_utt)
{
    int32 nfeat, ncep, inptr;
    int orig_n_frames;

    /* If this is a full utterance, process it all at once. */
    if (full_utt)
        return acmod_process_full_cep(acmod, inout_cep, inout_n_frames);

    /* Write to log file. */
    if (acmod->mfcfh)
        acmod_log_mfc(acmod, *inout_cep, *inout_n_frames);

    /* Maximum number of frames we're going to generate. */
    orig_n_frames = ncep = nfeat = *inout_n_frames;

    /* At the end of an utterance one window of extra output is expected,
     * at the start one window fewer. */
    /* FIXME: This behaviour isn't guaranteed... */
    if (acmod->state == ACMOD_ENDED)
        nfeat += feat_window_size(acmod->fcb);
    else if (acmod->state == ACMOD_STARTED)
        nfeat -= feat_window_size(acmod->fcb);

    /* Clamp number of features to fit available space. */
    if (nfeat > acmod->n_feat_alloc - acmod->n_feat_frame) {
        /* Grow it as needed - we have to grow it at the end of an
         * utterance because we can't return a short read there. */
        if (acmod->grow_feat || acmod->state == ACMOD_ENDED)
            acmod_grow_feat_buf(acmod, acmod->n_feat_alloc + nfeat);
        else
            /* Otherwise consume fewer input frames this time. */
            ncep -= (nfeat - (acmod->n_feat_alloc - acmod->n_feat_frame));
    }

    /* Where to start writing in the feature buffer. */
    if (acmod->grow_feat) {
        /* Grow to avoid wraparound if grow_feat == TRUE. */
        inptr = acmod->feat_outidx + acmod->n_feat_frame;
        while (inptr + nfeat >= acmod->n_feat_alloc)
            acmod_grow_feat_buf(acmod, acmod->n_feat_alloc * 2);
    }
    else {
        /* Fixed-size buffer: treat it as circular. */
        inptr = (acmod->feat_outidx + acmod->n_feat_frame) % acmod->n_feat_alloc;
    }

    /* FIXME: we can't split the last frame drop properly to be on the boundary,
     *        so just return
     */
    if (inptr + nfeat > acmod->n_feat_alloc && acmod->state == ACMOD_ENDED) {
        *inout_n_frames -= ncep;
        *inout_cep += ncep;
        return 0;
    }

    /* Write them in two parts if there is wraparound. */
    if (inptr + nfeat > acmod->n_feat_alloc) {
        /* Number of frames that fit before the physical end of the buffer. */
        int32 ncep1 = acmod->n_feat_alloc - inptr;

        /* Make sure we don't end the utterance here. */
        nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep,
                                     &ncep1,
                                     (acmod->state == ACMOD_STARTED),
                                     FALSE,
                                     acmod->feat_buf + inptr);
        if (nfeat < 0)
            return -1;
        /* Move the output feature pointer forward. */
        acmod->n_feat_frame += nfeat;
        assert(acmod->n_feat_frame <= acmod->n_feat_alloc);
        inptr += nfeat;
        inptr %= acmod->n_feat_alloc;
        /* Move the input feature pointers forward. */
        *inout_n_frames -= ncep1;
        *inout_cep += ncep1;
        ncep -= ncep1;
    }

    /* Remaining (or only) part; ncep is updated to the number of frames
     * actually consumed. */
    nfeat = feat_s2mfc2feat_live(acmod->fcb, *inout_cep,
                                 &ncep,
                                 (acmod->state == ACMOD_STARTED),
                                 (acmod->state == ACMOD_ENDED),
                                 acmod->feat_buf + inptr);
    if (nfeat < 0)
        return -1;
    acmod->n_feat_frame += nfeat;
    assert(acmod->n_feat_frame <= acmod->n_feat_alloc);
    /* Move the input feature pointers forward. */
    *inout_n_frames -= ncep;
    *inout_cep += ncep;
    /* The first block of an utterance has now been seen. */
    if (acmod->state == ACMOD_STARTED)
        acmod->state = ACMOD_PROCESSING;

    return orig_n_frames - *inout_n_frames;
}
/**
 * Walk the corpus and, for every utterance, add the feature frames of each
 * phone segment to the segment dump via segdmp_add_feat().
 *
 * For each utterance the word transcript and state segmentation are read,
 * the transcript is expanded to a phone list, the segmentation is checked
 * against that list, and the phones are converted to triphones.  Utterances
 * that fail one of those steps, or that have fewer than 9 frames, are
 * skipped.  Failure to read the transcript, segmentation or input features
 * is reported through E_FATAL.
 *
 * All per-utterance allocations are released on every path through the
 * loop body, including the skip paths.
 *
 * @param lex       Lexicon used to expand words into phones.
 * @param acmod_set Acoustic model set used for checking and triphone
 *                  conversion.
 * @param fcb       Feature control block used to compute the features.
 * @param type      Not used by this function.
 * @return Always 0.
 */
int
agg_phn_seg(lexicon_t *lex, acmod_set_t *acmod_set, feat_t *fcb, segdmp_type_t type)
{
    uint16 *seg;
    vector_t *mfcc;
    vector_t **feat;
    int32 n_frame;
    uint32 tick_cnt;

    acmod_id_t *phone;
    uint32 *start;
    uint32 *len;
    uint32 n_phone;
    uint32 s;
    char *btw_mark;

    char *trans;
    char **word;
    uint32 n_word;
    int32 mfc_veclen = cmd_ln_int32("-ceplen");

    uint32 n_stream;
    uint32 *veclen;

    tick_cnt = 0;

    n_stream = feat_dimension1(fcb);
    veclen = feat_stream_lengths(fcb);

    while (corpus_next_utt()) {
        if ((++tick_cnt % 500) == 0) {
            E_INFOCONT("[%u] ", tick_cnt);
        }

        if (corpus_get_sent(&trans) != S3_SUCCESS) {
            E_FATAL("Unable to read word transcript for %s\n", corpus_utt_brief_name());
        }

        if (corpus_get_seg(&seg, &n_frame) != S3_SUCCESS) {
            E_FATAL("Unable to read Viterbi state segmentation for %s\n", corpus_utt_brief_name());
        }

        /* First call counts the words, second call fills the array. */
        n_word = str2words(trans, NULL, 0);
        word = ckd_calloc(n_word, sizeof(char*));
        str2words(trans, word, n_word);

        /* Cleared first so the release below is well defined even if
         * mk_phone_list() does not store a value. */
        btw_mark = NULL;
        phone = mk_phone_list(&btw_mark, &n_phone, word, n_word, lex);
        start = ckd_calloc(n_phone, sizeof(uint32));
        len = ckd_calloc(n_phone, sizeof(uint32));

        /* check to see whether the word transcript and dictionary entries
           agree with the state segmentation */
        if (ck_seg(acmod_set, phone, n_phone, seg, n_frame, corpus_utt()) != S3_SUCCESS) {
            E_ERROR("ck_seg failed");
            ckd_free(btw_mark);
            goto next_utt;
        }

        if (cvt2triphone(acmod_set, phone, btw_mark, n_phone) != S3_SUCCESS) {
            E_ERROR("cvt2triphone failed");
            ckd_free(btw_mark);
            goto next_utt;
        }

        /* The between-word markers are only needed for triphone conversion. */
        ckd_free(btw_mark);

        if (mk_seg(acmod_set, seg, n_frame, phone, start, len, n_phone) != S3_SUCCESS) {
            E_ERROR("mk_seg failed");
            goto next_utt;
        }

        if (corpus_provides_mfcc()) {
            if (corpus_get_generic_featurevec(&mfcc, &n_frame, mfc_veclen) < 0) {
                E_FATAL("Can't read input features from %s\n", corpus_utt());
            }

            if (n_frame < 9) {
                E_WARN("utt %s too short\n", corpus_utt());
                if (mfcc) {
                    ckd_free(mfcc[0]);
                    ckd_free(mfcc);
                    mfcc = NULL;
                }
                goto next_utt;
            }

            feat = feat_array_alloc(fcb, n_frame + feat_window_size(fcb));
            feat_s2mfc2feat_live(fcb, mfcc, &n_frame, TRUE, TRUE, feat);

            /* One call per phone: the frames [start[s], start[s] + len[s]). */
            for (s = 0; s < n_phone; s++) {
                segdmp_add_feat(phone[s], &feat[start[s]], len[s]);
            }

            feat_array_free(feat);
            free(&mfcc[0][0]);
            ckd_free(mfcc);
        }
        else {
            E_FATAL("No data type specified\n");
        }

      next_utt:
        /* Per-utterance cleanup shared by the normal path and all skips. */
        free(trans);    /* alloc'ed using strdup, not ckd_*() */
        free(seg);      /* alloc'ed using malloc in areadshort(), not ckd_*() */
        ckd_free(word);
        ckd_free(phone);
        ckd_free(start);
        ckd_free(len);
    }

    return 0;
}
/*
 * Find Viterbi alignment.
 *
 * For every input frame: compute the feature vector, determine the active
 * senones and their Gaussian codebooks, evaluate and normalize the senone
 * scores, and advance the alignment by one frame.  On success the requested
 * segmentation outputs are written; timing statistics are printed to stdout
 * either way.
 *
 * Working buffers and output directory names are allocated once, on the
 * first call, and kept in function-level statics.  The function relies on
 * file-level state not visible in this block (n_feat, featlen, g, sen,
 * mdef, interp, senscale, cepsize, the timers, outsentfp, tot_nfr).
 */
static void align_utt (char *sent,	/* In: Reference transcript */
		       float32 **mfc,	/* In: MFC cepstra for input utterance */
		       int32 nfr,	/* In: #frames of input */
		       char *ctlspec,	/* In: Utt specification from control file */
		       char *uttid)	/* In: Utterance id, for logging and other use */
{
    static float32 **feat = NULL;
    static int32 w;
    static int32 topn;
    static gauden_dist_t ***dist;
    static int32 *senscr;
    static s3senid_t *sen_active;
    static int8 *mgau_active;
    static char *s2stsegdir;
    static char *stsegdir;
    static char *phsegdir;
    static char *wdsegdir;

    int32 i, s, sid, gid, n_sen_active, best;
    char *arg;
    align_stseg_t *stseg;
    align_phseg_t *phseg;
    align_wdseg_t *wdseg;

    if (! feat) {
	/* One-time allocation of necessary intermediate variables */

	/* Allocate space for a feature vector */
	feat = (float32 **) ckd_calloc (n_feat, sizeof(float32 *));
	for (i = 0; i < n_feat; i++)
	    feat[i] = (float32 *) ckd_calloc (featlen[i], sizeof(float32));

	/* Allocate space for top-N codeword density values in a codebook */
	w = feat_window_size ();	/* #MFC vectors needed on either side of current
					   frame to compute one feature vector.
					   Only referenced by the disabled code below. */
	topn = *((int32 *) cmd_ln_access("-topn"));
	if (topn > g->n_density) {
	    E_ERROR("-topn argument (%d) > #density codewords (%d); set to latter\n",
		   topn, g->n_density);
	    topn = g->n_density;
	}
	dist = (gauden_dist_t ***) ckd_calloc_3d (g->n_mgau, n_feat, topn,
						  sizeof(gauden_dist_t));

	/* Space for one frame of senone scores, and per frame active flags */
	senscr = (int32 *) ckd_calloc (sen->n_sen, sizeof(int32));
	sen_active = (s3senid_t *) ckd_calloc (sen->n_sen, sizeof(s3senid_t));
	mgau_active = (int8 *) ckd_calloc (g->n_mgau, sizeof(int8));

	/* Note various output directories */
	s2stsegdir = NULL;
	stsegdir = NULL;
	phsegdir = NULL;
	wdsegdir = NULL;
	if ((arg = (char *) cmd_ln_access ("-s2stsegdir")) != NULL)
	    s2stsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-stsegdir")) != NULL)
	    stsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-phsegdir")) != NULL)
	    phsegdir = (char *) ckd_salloc (arg);
	if ((arg = (char *) cmd_ln_access ("-wdsegdir")) != NULL)
	    wdsegdir = (char *) ckd_salloc (arg);
    }

    /* Disabled minimum-length check, kept as found: */
    /* HACK HACKA HACK BHIKSHA
    if (nfr <= (w<<1)) {
	E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid, (w<<1)+1, nfr);
	return;
    }
    END HACK HACKA HACK */

    cyctimer_reset_all ();
    counter_reset_all ();

    timing_reset (tm_utt);
    timing_start (tm_utt);

    cyctimer_resume (tmr_utt);

    /* AGC and CMN */
    /* NOTE(review): mfc-4 / nfr+8 presumably rely on the caller supplying
     * four extra frames on each side of the utterance -- confirm. */
    arg = (char *) cmd_ln_access ("-cmn");
    if (strcmp (arg, "current") == 0)
	norm_mean (mfc-4, nfr+8, cepsize); /* -4 HACKA HACK */
    arg = (char *) cmd_ln_access ("-agc");
    if (strcmp (arg, "max") == 0)
	agc_max (mfc, nfr);

    if (align_build_sent_hmm (sent) != 0) {
	align_destroy_sent_hmm ();
	cyctimer_pause (tmr_utt);

	E_ERROR("No sentence HMM; no alignment for %s\n", uttid);
	return;
    }

    align_start_utt (uttid);

    /*
     * A feature vector for frame f depends on input MFC vectors [f-w..f+w].  Hence
     * the feature vector corresponding to the first w and last w input frames is
     * undefined.  We define them by simply replicating the first and last true
     * feature vectors (presumably silence regions).
     * (That replication is part of the disabled code in the loop below; as
     * written, every frame is computed directly from mfc+i.)
     */
    for (i = 0; i < nfr; i++) {
	cyctimer_resume (tmr_utt);

	/* Compute feature vector for current frame from input speech cepstra */
	/* HACK HACKA HACK BHIKSHA
	if (i < w)
	    feat_cep2feat (mfc+w, feat);
	else if (i >= nfr-w)
	    feat_cep2feat (mfc+(nfr-w-1), feat);
	else
	 END HACK HACKA HACK */
	    feat_cep2feat (mfc+i, feat);

	/*
	 * Evaluate gaussian density codebooks and senone scores for input codeword.
	 * Evaluate only active codebooks and senones.
	 */
	/* Obtain active senone flags */
	cyctimer_resume (tmr_senone);
	align_sen_active (sen_active, sen->n_sen);
	/* Flag all CI senones to active if interpolating */
	if (interp) {
	    for (s = 0; s < mdef->n_ci_sen; s++)
		sen_active[s] = 1;
	}
	/* Turn active flags into list (for faster access).  Done in place:
	 * the write index never gets ahead of the read index. */
	n_sen_active = 0;
	for (s = 0; s < mdef->n_sen; s++) {
	    if (sen_active[s])
		sen_active[n_sen_active++] = s;
	}
	cyctimer_pause (tmr_senone);

	/* Flag all active mixture-gaussian codebooks */
	cyctimer_resume (tmr_gauden);
	for (gid = 0; gid < g->n_mgau; gid++)
	    mgau_active[gid] = 0;
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    mgau_active[sen->mgau[sid]] = 1;
	}

	/* Compute topn gaussian density values (for active codebooks) */
	for (gid = 0; gid < g->n_mgau; gid++)
	    if (mgau_active[gid])
		gauden_dist (g, gid, topn, feat, dist[gid]);
	cyctimer_pause (tmr_gauden);

	/* Evaluate active senones */
	cyctimer_resume (tmr_senone);
	best = (int32) 0x80000000;	/* start below any real score */
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    senscr[sid] = senone_eval (sen, sid, dist[sen->mgau[sid]], topn);
	    if (best < senscr[sid])
		best = senscr[sid];
	}
	if (interp) {
	    for (s = 0; s < n_sen_active; s++) {
		if ((sid = sen_active[s]) >= mdef->n_ci_sen)
		    interp_cd_ci (interp, senscr, sid, mdef->cd2cisen[sid]);
	    }
	}

	/* Normalize senone scores (interpolation above can only lower best score) */
	for (s = 0; s < n_sen_active; s++) {
	    sid = sen_active[s];
	    senscr[sid] -= best;
	}
	/* Remember the amount subtracted from this frame's scores. */
	senscale[i] = best;
	cyctimer_pause (tmr_senone);

	/* Step alignment one frame forward */
	cyctimer_resume (tmr_align);
	align_frame (senscr);
	cyctimer_pause (tmr_align);

	cyctimer_pause (tmr_utt);
    }
    timing_stop (tm_utt);

    printf ("\n");

    /* Wind up alignment for this utterance */
    if (align_end_utt (&stseg, &phseg, &wdseg) < 0)
	E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
    else {
	/* Write each output for which a directory / file was configured. */
	if (s2stsegdir)
	    write_s2stseg (s2stsegdir, stseg, uttid, ctlspec);
	if (stsegdir)
	    write_stseg (stsegdir, stseg, uttid, ctlspec);
	if (phsegdir)
	    write_phseg (phsegdir, phseg, uttid, ctlspec);
	if (wdsegdir)
	    write_wdseg (wdsegdir, wdseg, uttid, ctlspec);
	if (outsentfp)
	    write_outsent (outsentfp, wdseg, uttid);
    }

    align_destroy_sent_hmm ();

    cyctimer_print_all_norm (stdout, nfr*0.01, tmr_utt);
    counter_print_all (stdout);

    printf("EXECTIME: %5d frames, %7.2f sec CPU, %6.2f xRT; %7.2f sec elapsed, %6.2f xRT\n",
	   nfr,
	   tm_utt->t_cpu, tm_utt->t_cpu * 100.0 / nfr,
	   tm_utt->t_elapsed, tm_utt->t_elapsed * 100.0 / nfr);

    /* Running total of frames across utterances. */
    tot_nfr += nfr;
}
/*
 * Run a Viterbi alignment of one utterance against its transcript.
 *
 *   sent     reference transcript
 *   nfr      number of input frames
 *   ctlspec  utterance specification from the control file
 *   uttid    utterance id, used for logging and output naming
 *
 * Utterances no longer than twice the feature window are rejected.  For
 * every frame the senone scores are evaluated with whichever acoustic
 * scorer the kbcore provides, then the alignment is stepped forward.  If
 * the final state is reached the configured segmentation outputs are
 * written; timing statistics go to stdout either way.
 */
static void
align_utt(char *sent, int32 nfr, char *ctlspec, char *uttid)
{
    align_stseg_t *stseg;
    align_phseg_t *phseg;
    align_wdseg_t *wdseg;
    int32 frm;
    int32 win;
    int32 min_span;

    /* Number of cepstral vectors needed on either side of a frame to
     * compute one feature vector. */
    win = feat_window_size(kbcore_fcb(kbc));
    min_span = win << 1;
    if (nfr <= min_span) {
        E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid,
                min_span + 1, nfr);
        return;
    }

    ptmr_reset_all(timers);

    ptmr_reset(&tm_utt);
    ptmr_start(&tm_utt);
    ptmr_reset(&tm_ovrhd);
    ptmr_start(&tm_ovrhd);
    ptmr_start(timers + tmr_utt);

    if (align_build_sent_hmm(sent, cmd_ln_int32_r(kbc->config, "-insert_sil")) != 0) {
        align_destroy_sent_hmm();
        ptmr_stop(timers + tmr_utt);

        E_ERROR("No sentence HMM; no alignment for %s\n", uttid);
        return;
    }

    align_start_utt(uttid);

    for (frm = 0; frm < nfr; ++frm) {
        ptmr_start(timers + tmr_utt);

        /* Find out which senones the alignment needs for this frame. */
        ptmr_start(timers + tmr_gauden);
        ptmr_start(timers + tmr_senone);

        align_sen_active(ascr->sen_active, ascr->n_sen);

        /* Score the frame with the first scorer that is present. */
        if (kbc->ms_mgau) {
            ms_cont_mgau_frame_eval(ascr, kbc->ms_mgau, kbc->mdef,
                                    feat[frm], frm);
        }
        else if (kbc->s2_mgau) {
            s2_semi_mgau_frame_eval(kbc->s2_mgau, ascr, fastgmm,
                                    feat[frm], frm);
        }
        else if (kbc->mgau) {
            approx_cont_mgau_ci_eval(kbcore_svq(kbc),
                                     kbcore_gs(kbc),
                                     kbcore_mgau(kbc),
                                     fastgmm,
                                     kbc->mdef,
                                     feat[frm][0],
                                     ascr->cache_ci_senscr[0],
                                     &(ascr->cache_best_list[0]),
                                     frm,
                                     kbcore_logmath(kbc));
            approx_cont_mgau_frame_eval(kbcore_mdef(kbc),
                                        kbcore_svq(kbc),
                                        kbcore_gs(kbc),
                                        kbcore_mgau(kbc),
                                        fastgmm, ascr,
                                        feat[frm][0],
                                        frm,
                                        ascr->cache_ci_senscr[0],
                                        &tm_ovrhd,
                                        kbcore_logmath(kbc));
        }

        ptmr_stop(timers + tmr_gauden);
        ptmr_stop(timers + tmr_senone);

        /* Advance the alignment by one frame. */
        ptmr_start(timers + tmr_align);
        align_frame(ascr->senscr);
        ptmr_stop(timers + tmr_align);

        ptmr_stop(timers + tmr_utt);
    }
    ptmr_stop(&tm_utt);
    ptmr_stop(&tm_ovrhd);

    printf("\n");

    /* Finish the utterance and emit whatever outputs are configured. */
    if (align_end_utt(&stseg, &phseg, &wdseg) >= 0) {
        if (s2stsegdir) {
            write_s2stseg(s2stsegdir, stseg, uttid, ctlspec,
                          cmd_ln_boolean_r(kbc->config, "-s2cdsen"));
        }
        if (stsegdir) {
            write_stseg(stsegdir, stseg, uttid, ctlspec);
        }
        if (phsegdir) {
            write_phseg(phsegdir, phseg, uttid, ctlspec);
        }
        if (phlabdir) {
            write_phlab(phlabdir, phseg, uttid, ctlspec,
                        cmd_ln_int32_r(kbc->config, "-frate"));
        }
        if (wdsegdir) {
            write_wdseg(wdsegdir, wdseg, uttid, ctlspec);
        }
        if (outsentfp) {
            write_outsent(outsentfp, wdseg, uttid);
        }
        if (outctlfp) {
            write_outctl(outctlfp, ctlspec);
        }
    }
    else {
        E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
    }

    align_destroy_sent_hmm();

    ptmr_print_all(stdout, timers, nfr * 0.1);

    printf
        ("EXECTIME: %5d frames, %7.2f sec CPU, %6.2f xRT; %7.2f sec elapsed, %6.2f xRT\n",
         nfr, tm_utt.t_cpu, tm_utt.t_cpu * 100.0 / nfr, tm_utt.t_elapsed,
         tm_utt.t_elapsed * 100.0 / nfr);

    tot_nfr += nfr;
}
/*
 * Dump the feature vectors of the whole corpus into a single file.
 *
 * For every utterance delivered by corpus_next_utt() the cepstra are read,
 * converted to feature vectors with feat_s2mfc2feat_live(), and every
 * stride-th frame (counted across all utterances) is appended to the dump
 * file as blksz float32 values, where blksz is the sum of all stream lengths
 * of fcb.  A uint32 placeholder is written at the start and overwritten with
 * the number of frames actually written once the corpus is exhausted.
 *
 *   fcb     feature computation object
 *   type    not used by this function
 *   fn      name of the dump file, passed to open_dmp()
 *   stride  write one frame out of every `stride`; must be non-zero
 *
 * Returns S3_SUCCESS, or S3_ERROR if the file cannot be opened, positioned,
 * written or closed, or if stride is zero.  Unreadable input features and
 * more than 10 failed frame writes are fatal (E_FATAL).
 *
 * NOTE(review): the stream returned by open_dmp() is assumed to be owned by
 * this function and is closed before returning -- confirm against open_dmp().
 */
int
agg_all_seg(feat_t *fcb, segdmp_type_t type, const char *fn, uint32 stride)
{
    enum { MAX_WRITE_RETRIES = 10, RETRY_SLEEP_SEC = 3 };

    uint32 seq_no;
    vector_t *mfcc = NULL;
    uint32 mfc_veclen = cmd_ln_int32("-ceplen");
    uint32 n_frame;
    uint32 n_out_frame = 0;
    uint32 blksz = 0;
    vector_t **feat = NULL;
    uint32 i, j;
    uint32 t;
    uint32 n_stream;
    const uint32 *veclen;
    FILE *fp;
    uint32 ignore = 0;
    long start;
    int32 no_retries = 0;
    int ret = S3_ERROR;

    (void) type;

    /* `j % stride` below would divide by zero. */
    if (stride == 0) {
        E_WARN("stride must be non-zero; nothing written to %s\n", fn);
        return S3_ERROR;
    }

    n_stream = feat_dimension1(fcb);
    veclen = feat_stream_lengths(fcb);

    /* One output record holds all streams of a frame back to back. */
    for (i = 0, blksz = 0; i < n_stream; i++)
        blksz += veclen[i];

    fp = open_dmp(fn);
    if (fp == NULL) {
        E_ERROR_SYSTEM("Unable to open dmp file");
        return S3_ERROR;
    }

    start = ftell(fp);
    if (start < 0) {
        E_ERROR_SYSTEM("Unable to get position in dmp file");
        goto done;
    }

    /* Placeholder for the frame count; its value is irrelevant because it is
     * overwritten with n_out_frame after the corpus has been processed. */
    if (s3write(&i, sizeof(uint32), 1, fp, &ignore) != 1) {
        E_ERROR_SYSTEM("Unable to write to dmp file");
        goto done;
    }

    for (seq_no = corpus_get_begin(), j = 0; corpus_next_utt(); seq_no++) {
        /* Release the buffers of the previous utterance. */
        if (mfcc) {
            ckd_free(mfcc[0]);
            ckd_free(mfcc);
            mfcc = NULL;
        }
        if (feat) {
            feat_array_free(feat);
            feat = NULL;
        }

        /* get the MFCC data for the utterance */
        if (corpus_get_generic_featurevec(&mfcc, &n_frame, mfc_veclen) < 0) {
            E_FATAL("Can't read input features from %s\n", corpus_utt());
        }

        if ((seq_no % 1000) == 0) {
            E_INFO("[%u]\n", seq_no);
        }

        /* Utterances with fewer than 9 frames are skipped. */
        if (n_frame < 9) {
            E_WARN("utt %s too short\n", corpus_utt());
            continue;
        }

        feat = feat_array_alloc(fcb, n_frame + feat_window_size(fcb));
        feat_s2mfc2feat_live(fcb, mfcc, &n_frame, TRUE, TRUE, feat);

        /* j counts frames over the whole corpus, so the stride does not
         * restart at utterance boundaries. */
        for (t = 0; t < n_frame; t++, j++) {
            if ((j % stride) != 0)
                continue;

            /* NOTE(review): a short write is retried by rewriting the whole
             * record; whether s3write() can leave a partial record behind
             * is not visible here -- confirm. */
            while (s3write(&feat[t][0][0], sizeof(float32), blksz, fp, &ignore) != blksz) {
                E_ERROR_SYSTEM("Unable to write to dmp file");

                /* The retry budget is shared by all frames of the run. */
                if (++no_retries > MAX_WRITE_RETRIES) {
                    E_FATAL("Failed to write to a dmp file after 10 retries of getting MFCC(about 30 seconds)\n ");
                }

                E_INFO("sleeping...\n");
                sleep(RETRY_SLEEP_SEC);
            }

            ++n_out_frame;
        }
    }

    /* Go back and replace the placeholder with the real frame count. */
    if (fseek(fp, start, SEEK_SET) < 0) {
        E_ERROR_SYSTEM("Unable to seek to begin of dmp");
        goto done;
    }

    E_INFO("Wrote %u frames to %s\n", n_out_frame, fn);

    if (s3write((void *) &n_out_frame, sizeof(uint32), 1, fp, &ignore) != 1) {
        E_ERROR_SYSTEM("Unable to write to dmp file");
        goto done;
    }

    ret = S3_SUCCESS;

done:
    /* Buffers of the last utterance are still allocated at this point. */
    if (feat) {
        feat_array_free(feat);
    }
    if (mfcc) {
        ckd_free(mfcc[0]);
        ckd_free(mfcc);
    }

    /* fclose() flushes buffered data, so a failure here means lost output. */
    if (fclose(fp) != 0) {
        E_ERROR_SYSTEM("Unable to close dmp file");
        ret = S3_ERROR;
    }

    return ret;
}
/*
 * Decode one utterance: read its cepstra, run the lextree Viterbi search
 * frame by frame, print the best hypothesis, and log the word lattice.
 *
 *   data     knowledge base (kb_t *) passed through as an opaque pointer
 *   uttfile  utterance file name, combined with -cepdir / -cepext
 *   sf, ef   start and end frame passed to s2mfc_read()
 *   uttid    utterance id, used in log messages and as the stem of the
 *            lattice file name "<uttid>.lat"
 *
 * Nothing is decoded if the cepstrum file cannot be read or if fewer than
 * 2 * featwin + 1 frames are available.  If the lattice file cannot be
 * opened the lattice is written to stdout instead.
 */
static void
decode_utt(void *data, char *uttfile, int32 sf, int32 ef, char *uttid)
{
    kb_t *kb;
    acoustic_t *am;
    int32 featwin, nfr, min_utt_frames, n_vithist;
    char cepfile[4096], latfile[4096];
    vithist_t *finalhist;
    int32 i, f;
    glist_t hyplist;
    FILE *latfp;

    printf("\n");
    fflush(stdout);
    E_INFO("Utterance %s\n", uttid);

    kb = (kb_t *) data;
    am = kb->am;
    featwin = feat_window_size(am->fcb);

    /* Build complete cepfile name and read cepstrum data; check for min length */
    ctl_infile(cepfile, cmd_ln_str("-cepdir"), cmd_ln_str("-cepext"), uttfile);
    if ((nfr = s2mfc_read(cepfile, sf, ef, featwin, am->mfc, S3_MAX_FRAMES)) < 0) {
        E_ERROR("%s: MFC read failed\n", uttid);
        return;
    }
    /* nfr includes featwin extra frames on each side (see the search loop
     * bounds below); report only the frames that will be decoded. */
    E_INFO("%s: %d frames\n", uttid, nfr - (featwin << 1));

    ptmr_reset(kb->tm);
    ptmr_reset(kb->tm_search);
    ptmr_start(kb->tm);

    min_utt_frames = (featwin << 1) + 1;
    if (nfr < min_utt_frames) {
        /* Do not leave the utterance timer running on the early exit. */
        ptmr_stop(kb->tm);
        E_ERROR("%s: Utterance shorter than %d frames (%d); ignored\n",
                uttid, min_utt_frames, nfr);
        return;
    }

    /* CMN/AGC */
    if (strcmp(cmd_ln_str("-cmn"), "current") == 0)
        cmn(am->mfc, nfr, feat_cepsize(am->fcb));
    if (strcmp(cmd_ln_str("-agc"), "max") == 0)
        agc_max(am->mfc, nfr);

    /* Process utterance: i indexes the cepstrum buffer, f the decoded frame */
    lextree_vit_start(kb, uttid);
    for (i = featwin, f = 0; i < nfr - featwin; i++, f++) {
        am->senscale[f] = acoustic_eval(am, i);

        ptmr_start(kb->tm_search);
        lextree_vit_frame(kb, f, uttid);
        printf(" %d,%d,%d", f, glist_count(kb->vithist[f]),
               glist_count(kb->lextree_active));
        fflush(stdout);
        ptmr_stop(kb->tm_search);
    }
    printf("\n");
    finalhist = lextree_vit_end(kb, f, uttid);

    hyplist = vithist_backtrace(finalhist, kb->am->senscale);
    hyp_log(stdout, hyplist, _dict_wordstr, (void *) kb->dict);
    hyp_myfree(hyplist);
    printf("\n");

    /* Log the entire Viterbi word lattice.  The name is built with a bounded
     * write so that an over-long uttid cannot overflow latfile. */
    snprintf(latfile, sizeof(latfile), "%s.lat", uttid);
    if ((latfp = fopen(latfile, "w")) == NULL) {
        E_ERROR("fopen(%s,w) failed; using stdout\n", latfile);
        latfp = stdout;
    }
    n_vithist = vithist_log(latfp, kb->vithist, f, _dict_wordstr, (void *) kb->dict);
    if (latfp != stdout)
        fclose(latfp);
    else {
        printf("\n");
        fflush(stdout);
    }

    ptmr_stop(kb->tm);

    /* Per-frame statistics are only meaningful if something was decoded. */
    if (f > 0) {
        printf("TMR(%s): %5d frames; %.1fs CPU, %.2f xRT; %.1fs CPU(search), %.2f xRT; %.1fs Elapsed, %.2f xRT\n",
               uttid, f,
               kb->tm->t_cpu, kb->tm->t_cpu * 100.0 / f,
               kb->tm_search->t_cpu, kb->tm_search->t_cpu * 100.0 / f,
               kb->tm->t_elapsed, kb->tm->t_elapsed * 100.0 / f);
        printf("CTR(%s): %5d frames; %d Sen (%.1f/fr); %d HMM (%.1f/fr); %d Words (%.1f/fr)\n",
               uttid, f,
               kb->n_sen_eval, ((float64) kb->n_sen_eval) / f,
               kb->n_hmm_eval, ((float64) kb->n_hmm_eval) / f,
               n_vithist, ((float64) n_vithist) / f);
    }

    /* Cleanup */
    glist_free(kb->lextree_active);
    kb->lextree_active = NULL;
    for (; f >= -1; --f) {      /* I.e., including dummy START_WORD node at frame -1 */
        glist_myfree(kb->vithist[f], sizeof(vithist_t));
        kb->vithist[f] = NULL;
    }

    lm_cache_reset(kb->lm);
}