/* LoadFile: load whole file or segments and accumulate variance */ static void LoadFile(char *fn) { ParmBuf pbuf; BufferInfo info; char labfn[80]; Transcription *trans; long segStIdx,segEnIdx; int i,j,ncas,nObs; LLink p; if (segId == NULL) { /* load whole parameter file */ if((pbuf=OpenBuffer(&iStack, fn, 0, dff, FALSE_dup, FALSE_dup))==NULL) HError(2050,"LoadFile: Config parameters invalid"); GetBufferInfo(pbuf,&info); CheckData(fn,info); nObs = ObsInBuffer(pbuf); for (i=0; i<nObs; i++){ ReadAsTable(pbuf,i,&obs); AccVar(obs); } if (trace&T_LOAD) { printf(" %d observations loaded from %s\n",nObs,fn); fflush(stdout); } CloseBuffer(pbuf); } else { /* load segment of parameter file */ MakeFN(fn,labDir,labExt,labfn); trans = LOpen(&iStack,labfn,lff); ncas = NumCases(trans->head,segId); if ( ncas > 0) { if((pbuf=OpenBuffer(&iStack, fn, 0, dff, FALSE_dup, FALSE_dup))==NULL) HError(2050,"LoadFile: Config parameters invalid"); GetBufferInfo(pbuf,&info); CheckData(fn,info); for (i=1,nObs=0; i<=ncas; i++) { p = GetCase(trans->head,segId,i); segStIdx= (long) (p->start/info.tgtSampRate); segEnIdx = (long) (p->end/info.tgtSampRate); if (trace&T_SEGS) printf(" loading seg %s [%ld->%ld]\n", segId->name,segStIdx,segEnIdx); if (segEnIdx >= ObsInBuffer(pbuf)) segEnIdx = ObsInBuffer(pbuf)-1; if (segEnIdx >= segStIdx) { for (j=segStIdx;j<=segEnIdx;j++) { ReadAsTable(pbuf,j,&obs); AccVar(obs); ++nObs; } } } CloseBuffer(pbuf); if (trace&T_LOAD) printf(" %d observations loaded from %s\n",nObs,fn); } } ResetHeap(&iStack); }
/* LoadTransLabs: Load transcription from file */ Transcription *LoadTransLabs(char *src) { Transcription *t; MakeFN(src,labDir,labExt,labFile); if(trace & T_SEGMENT) printf("Loading label file %s\n",labFile); t = LOpen(&lStack,labFile,srcLabFF); if(chopF && ! stenSet) SetLabSeg(t); return t; }
/* DoGeneration: Generate parameter sequences from HMMs */ void DoGeneration(char *labfn) { char labFn[MAXFNAMELEN], buf[MAXSTRLEN]; int t; Boolean eSep; Transcription *tr; if (trace & T_TOP) { printf(" Generating Label %s\n", NameOf(labfn, buf)); fflush(stdout); } /* load a given input label file */ ResetHeap(&utt->transStack); MakeFN(labfn, labDir, labExt, labFn); tr = LOpen(&genStack, labFn, lff); /* compose a sentence HMM corresponding to the input label */ InitialiseGenInfo(genInfo, tr, FALSE); /* set utterance informations for forward-backward algorithm */ SetStreamWidths(hmset.pkind, hmset.vecSize, hmset.swidth, &eSep); utt->tr = tr; utt->Q = genInfo->labseqlen; utt->T = genInfo->tframe; utt->twoDataFiles = FALSE; utt->o = (Observation *) New(&gstack, utt->T * sizeof(Observation)); utt->o--; for (t = 1; t <= utt->T; t++) utt->o[t] = MakeObservation(&gstack, hmset.swidth, hmset.pkind, FALSE, eSep); /* parameter generation */ ParamGen(genInfo, utt, fbInfo, type); /* output state durations and generated parameter sequences */ if (!stateAlign) WriteStateDurations(labfn, genInfo); WriteParms(labfn, genInfo); /* free memory */ Dispose(&gstack, ++utt->o); ResetGenInfo(genInfo); /* increment total number of generated frames */ totalT += utt->T; totalPr += utt->pr; return; }
int main(int argc, char *argv[]) { char * labFn, *listfn, *s; int i,fidx; MLFEntry *me = NULL; Transcription *t; void InitStats(char *listfn); void GatherStats(Transcription *t); void OutputStats(void); if(InitShell(argc,argv,hlstats_version,hlstats_vc_id)<SUCCESS) HError(1300,"HLStats: InitShell failed"); InitMem(); InitMath(); InitWave(); InitLabel(); InitLM(); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(0); SetConfParms(); enterId=GetLabId("!ENTER",TRUE); /* All sentences should or are coerced */ exitId=GetLabId("!EXIT",TRUE); /* to start enterId and end exitId */ nullId=GetLabId("!NULL",TRUE); /* Name for words not in list */ while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(1319,"HLStats: Bad switch %s; must be single letter",s); switch(s[0]){ case 'b': doBigram = TRUE; if (NextArg() != STRINGARG) HError(1319,"HLStats: Ngram output file name expected"); bigFile = GetStrArg(); break; case 'c': doLCount = TRUE; lCountLimit = GetChkedInt(0,100000,s); break; case 'd': doDurs = TRUE; break; case 'f': bigFloor = GetChkedFlt(0.0,1000.0,s); break; case 'h': hSize = GetChkedInt(1,2,s); break; case 'l': doList = TRUE; if (NextArg() != STRINGARG) HError(1319,"HLStats: Output label list file name expected"); listFile = GetStrArg(); break; case 'o': doBOff = TRUE; break; case 'p': doPCount = TRUE; pCountLimit = GetChkedInt(0,100000,s); break; case 's': if (NextArg() != STRINGARG) HError(1319,"HLStats: ENTER label name expected"); enterId=GetLabId(GetStrArg(),TRUE); if (NextArg() != STRINGARG) HError(1319,"HLStats: EXIT label name expected"); exitId=GetLabId(GetStrArg(),TRUE); break; case 't': bigThresh = GetChkedInt(0,100,s); break; case 'u': uniFloor = GetChkedFlt(0.0,1000.0,s); break; case 'G': if (NextArg() != STRINGARG) HError(1319,"HLStats: Input label File format expected"); if((ff = Str2Format(GetStrArg())) == ALIEN) HError(-1389,"HLStats: Warning ALIEN Label file format set"); break; case 'I': if (NextArg() != STRINGARG) HError(1319,"HLStats: Input MLF file name expected"); LoadMasterFile(GetStrArg()); break; case 'T': if (NextArg() != INTARG) HError(1319,"HLStats: Trace value expected"); trace = GetChkedInt(0,017,s); break; default: HError(1319,"HLStats: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) HError(1319,"HLStats: Label list file name expected"); listfn = GetStrArg(); if (!(doDurs || doBigram || doList || doLCount || doPCount)) HError(1330,"HLStats: Nothing to do!"); InitStats(listfn); i=0; while (NumArgs()>0) { if (NextArg()!=STRINGARG) HError(1319,"HLStats: Input label file name expected"); labFn = GetStrArg(); if (IsMLFFile(labFn)) { fidx = NumMLFFiles(); if ((me=GetMLFTable()) != NULL) { while(me->next != NULL) me=me->next; LoadMasterFile(labFn); me=me->next; } else { LoadMasterFile(labFn); me=GetMLFTable(); } while (me != NULL) { if (me->type == MLF_IMMEDIATE && me->def.immed.fidx == fidx) { if (trace&T_FIL) { printf(" Processing file %s\n",me->pattern); fflush(stdout); } t = LOpen(&tmpHeap,me->pattern,ff); if (t->numLists<1) HError(-1330,"HLStats: Empty file %s",me->pattern); else GatherStats(t),i++; Dispose(&tmpHeap,t); } me = me->next; if ((trace&T_BAS) && !(trace&T_FIL) && NumMLFEntries()>5000 && i%1000==0) printf(". "),fflush(stdout); } if ((trace&T_BAS) && !(trace&T_FIL) && NumMLFEntries()>5000) printf("\n"); } else { if (trace&T_FIL) { printf(" Processing file %s\n",labFn); fflush(stdout); } t = LOpen(&tmpHeap,labFn,ff); if (t->numLists<1) HError(-1330,"HLStats: Empty file %s",me->pattern); else GatherStats(t),i++; Dispose(&tmpHeap,t); } } if (trace&T_MEM) PrintAllHeapStats(); OutputStats(); if (trace&T_MEM) PrintAllHeapStats(); Exit(0); return (0); /* never reached -- make compiler happy */ }
/* LoadFile: load whole file or segments into segStore */ void LoadFile(char *fn) { BufferInfo info; char labfn[80]; Transcription *trans; long segStIdx,segEnIdx; static int segIdx=1; /* Between call handle on latest seg in segStore */ static int prevSegIdx=1; HTime tStart, tEnd; int i,k,s,ncas,nObs=0,segLen; LLink p; Observation obs; if((pbuf=OpenBuffer(&bufferStack, fn, 10, dff, FALSE_dup, FALSE_dup))==NULL) HError(2150,"LoadFile: Config parameters invalid"); GetBufferInfo(pbuf,&info); CheckData(fn,info); if (firstTime) InitSegStore(&info); if (segId == NULL) { /* load whole parameter file */ nObs = ObsInBuffer(pbuf); tStart = 0.0; tEnd = (info.tgtSampRate * nObs); LoadSegment(segStore, tStart, tEnd, pbuf); segIdx++; } else { /* load segment of parameter file */ MakeFN(fn,labDir,labExt,labfn); trans = LOpen(&transStack,labfn,lff); ncas = NumCases(trans->head,segId); if ( ncas > 0) { for (i=1,nObs=0; i<=ncas; i++) { p = GetCase(trans->head,segId,i); segStIdx = (long)(p->start/info.tgtSampRate); segEnIdx = (long)(p->end/info.tgtSampRate); if (segEnIdx >= ObsInBuffer(pbuf)) segEnIdx = ObsInBuffer(pbuf)-1; if (segEnIdx - segStIdx + 1 >= nStates-2) { LoadSegment(segStore, p->start, p->end, pbuf); if (trace&T_LD1) printf(" loading seg %s %f[%ld]->%f[%ld]\n",segId->name, p->start,segStIdx,p->end,segEnIdx); nObs += SegLength(segStore, segIdx); segIdx++; }else if (trace&T_LD1) printf(" seg %s %f->%f ignored\n",segId->name, p->start,p->end); } } } if (hset.hsKind == DISCRETEHS){ for (k=prevSegIdx; k<segIdx; k++){ segLen = SegLength(segStore, k); for (i=1; i<=segLen; i++){ obs = GetSegObs(segStore, k, i); for (s=1; s<=nStreams; s++){ if( (obs.vq[s] < 1) || (obs.vq[s] > maxMixInS[s])) HError(2150,"LoadFile: Discrete data value [ %d ] out of range in stream [ %d ] in file %s",obs.vq[s],s,fn); } } } prevSegIdx=segIdx; } if (trace&T_LD0) printf(" %d observations loaded from %s\n",nObs,fn); CloseBuffer(pbuf); ResetHeap(&transStack); }
/* LoadFile: load whole file or segments and accumulate variance */ void LoadFile(char *fn) { ParmBuf pbuf; BufferInfo info; char labfn[80]; Transcription *trans; long segStIdx,segEnIdx; int i,s,j,ncas,nObs=0; LLink p; if (segId == NULL) { /* load whole parameter file */ if((pbuf=OpenBuffer(&iStack, fn, 0, dff, FALSE_dup, FALSE_dup))==NULL) HError(2550,"LoadFile: Config parameters invalid"); GetBufferInfo(pbuf,&info); CheckData(fn,info); nObs = ObsInBuffer(pbuf); for (i=0; i<nObs; i++) { for(s=1;s<=swidth[0];s++) obs.fv[s] = CreateVector(&dStack,swidth[s]); ReadAsTable(pbuf,i,&obs); for(s=1;s<=swidth[0];s++) StoreItem(dSeq[s],(Ptr)obs.fv[s]); } CloseBuffer(pbuf); } else { /* load segment of parameter file */ MakeFN(fn,labDir,labExt,labfn); trans = LOpen(&iStack,labfn,lff); ncas = NumCases(trans->head,segId); if ( ncas > 0) { if((pbuf=OpenBuffer(&iStack, fn, 0, dff, FALSE_dup, FALSE_dup))==NULL) HError(2550,"LoadFile: Config parameters invalid"); GetBufferInfo(pbuf,&info); CheckData(fn,info); for (i=1,nObs=0; i<=ncas; i++) { p = GetCase(trans->head,segId,i); segStIdx= (long) (p->start/info.tgtSampRate); segEnIdx = (long) (p->end/info.tgtSampRate); if (trace&T_SEGS) printf(" loading seg %s [%ld->%ld]\n", segId->name,segStIdx,segEnIdx); if (segEnIdx >= ObsInBuffer(pbuf)) segEnIdx = ObsInBuffer(pbuf)-1; if (segEnIdx >= segStIdx) { for (j=segStIdx;j<=segEnIdx;j++) { /* SJY: The HInit code I copied this from had no */ /* SJY: CreateVector call here -- a bug? */ for(s=1;s<=swidth[0];s++) obs.fv[s] = CreateVector(&dStack,swidth[s]); ReadAsTable(pbuf,j,&obs); for(s=1;s<=swidth[0];s++) StoreItem(dSeq[s],(Ptr)obs.fv[s]); ++nObs; } } } CloseBuffer(pbuf); } } ResetHeap(&iStack); if (trace&T_LOAD) { printf(" %5d obs loaded from %s, streams: ",nObs,fn); for(s=1;s<=swidth[0];s++) printf("[%d]" ,swidth[s]); printf("\n"); fflush(stdout); } }
/* ProcessLabelFile: compute perplexity and related statistics from labels */ static void ProcessLabelFile(char *fn, int nSize) { LLink ll; double ppl; LabList *ref; LabId lab; Transcription *tr; int i,numPLabs,nLabel; tr = LOpen(&tempHeap, fn, lff); if (tr->numLists < 1) { HError(-16635,"ProcessLabelFile: transcription file %s is Empty",fn); return; } ref = GetLabelList(tr, 1); if (ref->head->succ == ref->tail) { HError(-16635,"ProcessLabelFile: transcription file %s is Empty",fn); return; } if (trace>0) { printf("Processing label file: %s\n", fn); fflush(stdout); } nLabel = CountLabs(ref); ZeroStats(&sent); sent.nTok = nLabel + 2; sent.nUtt = 1; /* copy labels into pLab, mapping OOVs */ numPLabs = 0; if (sstId!=NULL) /* add sentence start marker(s) */ for (i=0; i<(nSize-1); i++) pLab[numPLabs++] = sstId; for (i=0,ll=ref->head->succ; i<nLabel; i++,ll=ll->succ) { lab = GetEQLab(ll->labid); if ((i==0) && IS_SST(lab)) { sent.nTok--; continue; } if ((i==(nLabel-1)) && IS_SEN(lab)) { sent.nTok--; continue; } if (IS_UNK(lab)) { if (trace&T_OOV) printf("mapping OOV: %s\n", lab->name); StoreOOV(&sent,lab,1); lab = unkId; } pLab[numPLabs++] = lab; if (numPLabs>=LBUF_SIZE) { HError(16650, "Maximum utterance length in a label file exceeded (limit is compiled to be %d tokens)", LBUF_SIZE); } } if (senId!=NULL) /* add sentence end marker */ pLab[numPLabs++] = senId; CalcPerplexity(&sent, pLab, numPLabs, nSize); AddStats(&sent, &totl); if (trace&T_SEL) { /* compact info for sentence selection */ ppl = exp(-(sent.logpp)/(double) (sent.nWrd)); printf("#! %.4f", ppl); for (i=0, ll=ref->head->succ; i<nLabel; i++, ll=ll->succ) printf(" %s", ll->labid->name); printf("\n"); fflush(stdout); } }
/* DoAlignment: by creating network from transcriptions or lattices */ void DoAlignment(void) { FILE *nf; char lfn[MAXSTRLEN], buf[MAXSTRLEN]; Transcription *trans; Network *net; Boolean isPipe; int n=0; LogDouble currGenBeam; AdaptXForm *incXForm; if (trace&T_TOP) { if (loadNetworks) printf("New network will be used for each file\n"); else printf("Label file will be used to align each file\n"); fflush(stdout); } CreateHeap(&netHeap,"Net heap",MSTAK,1,0,8000,80000); while (NumArgs()>0) { if (NextArg() != STRINGARG) HError(3219,"DoAlignment: Data file name expected"); datFN = GetStrArg(); if (trace&T_TOP) { printf("Aligning File: %s\n",datFN); fflush(stdout); } if (labFileMask != NULL ) { /* support for rescoring lattice masks */ if (!MaskMatch(labFileMask,buf,datFN)) HError(2319,"DoAlignment: mask %s has no match with segemnt %s",labFileMask,datFN); MakeFN(buf,labInDir,labInExt,lfn); } else { MakeFN(datFN,labInDir,labInExt,lfn); } if (loadNetworks) { if ( (nf = FOpen(lfn,NetFilter,&isPipe)) == NULL) HError(3210,"DoAlignment: Cannot open Word Net file %s",lfn); if((wdNet = ReadLattice(nf,&netHeap,&vocab,TRUE,FALSE))==NULL) HError(3210,"DoAlignment: ReadLattice failed"); FClose(nf,isPipe); if (trace&T_TOP) { printf("Read lattice with %d nodes / %d arcs\n", wdNet->nn,wdNet->na); fflush(stdout); } } else { LabList *ll = NULL; trans=LOpen(&netHeap,lfn,ifmt); if (trans->numLists >= 1) ll = GetLabelList(trans,1); if (!ll && !bndId) HError(3233, "DoAlignment: cannot align empty transcription"); wdNet=LatticeFromLabels(ll, bndId, &vocab,&netHeap); if (trace&T_TOP) { printf("Created lattice with %d nodes / %d arcs from label file\n", wdNet->nn,wdNet->na); fflush(stdout); } } net=ExpandWordNet(&netHeap,wdNet,&vocab,&hset); ++n; currGenBeam = genBeam; /* This handles the initial input transform, parent transform setting and output transform creation */ if (UpdateSpkrStats(&hset, &xfInfo, datFN) && (!(xfInfo.useInXForm)) && (hset.semiTied == NULL)) { xfInfo.inXForm = NULL; } if (genBeamInc == 0.0) ProcessFile (datFN, net, n, currGenBeam, FALSE); else { Boolean completed; completed = ProcessFile (datFN, net, n, currGenBeam, TRUE); currGenBeam += genBeamInc; while (!completed && (currGenBeam <= genBeamLim - genBeamInc)) { completed = ProcessFile (datFN, net, n, currGenBeam, TRUE); currGenBeam += genBeamInc; } if (!completed) ProcessFile (datFN, net, n, currGenBeam, FALSE); } if (update > 0 && n%update == 0) { if (trace&T_TOP) { printf("Transforming model set\n"); fflush(stdout); } /* at every stage a new transform is created - fix?? Estimate transform and then set it up as the input XForm */ incXForm = CreateAdaptXForm(&hset,"inc"); TidyBaseAccs(); GenAdaptXForm(&hset,incXForm); xfInfo.inXForm = GetMLLRDiagCov(incXForm);; SetXForm(&hset,xfInfo.inXForm); ApplyHMMSetXForm(&hset,xfInfo.inXForm); } ResetHeap(&netHeap); } }