/* NOTE: the closing brace that opens the next line ends a definition that begins before this excerpt. ReportUsage: print the LMerge command-line synopsis to stdout with printf: a USAGE line naming the wordList, inModel and outModel arguments, then one line each for the -f, -i and -n options. The -f line shows ReturnLMName(DEF_SAVEFMT) in its default column. PrintStdOpts("GIST") is then called (presumably to list the standard options selected by that string -- confirm against its definition). Takes no arguments and returns nothing. The int main(...) header that ends the line is cut off in this excerpt. */
} void ReportUsage(void) { printf("\nUSAGE: LMerge [options] wordList inModel outModel\n\n"); printf(" Option Default\n\n"); printf(" -f s set output LM format to s %s\n", ReturnLMName(DEF_SAVEFMT)); printf(" -i f s interpolate with model s, weight f off\n"); printf(" -n n produce n-gram model max\n"); PrintStdOpts("GIST"); printf("\n\n"); } int main(int argc, char *argv[]) { int i; char *s,*c;
/*
 * CombineModels: load the models listed in lmInfo[1..nLModel-1] and merge
 * all nLModel models into a single back-off LM.
 *
 *   heap     heap handed to LoadLangModel and MergeModels
 *   lmi      not referenced by the body; the file-scope lmInfo array is
 *            used instead (NOTE(review): presumably callers pass lmInfo
 *            here -- confirm)
 *   nLModel  number of entries of lmInfo to use
 *   nSize    n-gram order of the merged model
 *   wl       word list to use, or NULL to derive one from the vocabularies
 *            of the loaded models
 *
 * lmInfo[0].weight is overwritten with 1 minus the sum of the other
 * weights.  lmInfo[0].lm must already be set on entry, since only entries
 * 1.. are loaded here.  Returns the model produced by MergeModels, with
 * its per-order save formats filled in.
 */
BackOffLM *CombineModels(MemHeap *heap,LMInfo *lmi,int nLModel,int nSize,WordMap *wl)
{
   int m, w, order, nUnique;
   float wSum;
   LMInfo *info;
   BackOffLM *merged;
   WordMap derived;
   LabId id;
   NameId *names;

   /* the first model receives whatever weight the others leave over */
   wSum = 0.0;
   for (m = 1; m < nLModel; m++)
      wSum += lmInfo[m].weight;
   lmInfo[0].weight = 1.0-wSum;

   /* model 0 is already in memory; read in the rest */
   for (m = 1; m < nLModel; m++) {
      info = &lmInfo[m];
      if (trace&T_TOP)
         printf("Loading language model from %s\n",info->fn);
      info->lm = LoadLangModel(info->fn,wl,1.0,LMP_FLOAT,heap);
   }

   if (wl == NULL) {
      /*
       * No word list supplied: build one from the union of the model
       * vocabularies, using each label's aux field as a marker.
       * NOTE(review): `derived` is a local, yet its address is passed to
       * MergeModels -- confirm the returned model does not keep it.
       */
      wl = &derived;

      /* pass 1: create every label and clear its marker */
      for (m = 0; m < nLModel; m++) {
         info = &lmInfo[m];
         names = info->lm->binMap;
         for (w = 1; w <= info->lm->vocSize; w++) {
            id = GetLabId(names[w]->name,TRUE);
            id->aux = NULL;
         }
      }

      /* pass 2: count distinct labels, tagging each one the first time */
      nUnique = 0;
      for (m = 0; m < nLModel; m++) {
         info = &lmInfo[m];
         names = info->lm->binMap;
         for (w = 1; w <= info->lm->vocSize; w++) {
            id = GetLabId(names[w]->name,FALSE);
            if (id->aux == NULL) {
               nUnique++;
               id->aux = (Ptr) wl;
            }
         }
      }

      CreateWordList(NULL,wl,nUnique+10);

      /* pass 3: store each tagged label once, clearing the tag as we go */
      nUnique = 0;
      for (m = 0; m < nLModel; m++) {
         info = &lmInfo[m];
         names = info->lm->binMap;
         for (w = 1; w <= info->lm->vocSize; w++) {
            id = GetLabId(names[w]->name,FALSE);
            if (id->aux == (Ptr) wl) {
               wl->id[nUnique] = id;
               nUnique++;
               id->aux = NULL;
            }
         }
      }
      wl->used = nUnique;
   }

   if (trace&T_TOP) {
      printf("Using language model(s): \n");
      for (m = 0; m < nLModel; m++) {
         info = &lmInfo[m];
         printf(" %d-gram %s, weight %.2f\n",info->lm->nSize,info->fn,info->weight);
      }
      printf("Generating %d-gram model %s\n",nSize,outFN);
      fflush(stdout);
   }

   merged = MergeModels(heap,lmInfo,nLModel,nSize,wl);

#ifdef HTK_CRYPT
   /* an encrypted model is switched from text to binary output */
   if (merged->encrypt && binfo.saveFmt==LMF_TEXT)
      binfo.saveFmt = LMF_BINARY;
#endif

   /* unigrams are always marked as text; higher orders use the save format */
   for (order = 1; order <= nSize; order++) {
      if (order == 1)
         merged->gInfo[order].fmt = LMF_TEXT;
      else
         merged->gInfo[order].fmt = binfo.saveFmt;
   }
   return merged;
}
int main(int argc, char *argv[]) { int i; char *s,*c; char fmt[256]; dictList *dEntry,*d; InitShell(argc,argv,prog_version,prog_vc_id); InitMem(); InitMath(); InitWave(); InitLabel(); InitDict(); InitWMap(); InitLUtil(); InitLModel(); InitPCalc(); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(EXIT_SUCCESS); SetConfParms(); CreateHeap(&langHeap,"langHeap",MSTAK,1,0.5,5000,40000); for (i=1; i<=LM_NSIZE; i++) cutOff[i] = 0; while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(16919,"Bad switch %s; must be single letter",s); switch(s[0]){ case 'c': i = GetChkedInt(2,LM_NSIZE,s); cutOff[i] = GetChkedInt(1,1000,s); break; case 'd': if (NextArg()!=STRINGARG) HError(16919,"LMCopy: Input dictionary file name expected"); dEntry=New(&gcheap,sizeof(dictList)); dEntry->fname=GetStrArg(); dEntry->next=NULL; if (dList==NULL) dList=dEntry; else { for (d=dList;d->next!=NULL;d=d->next); d->next=dEntry; } break; case 'f': strcpy(fmt,GetStrArg()); for (c=fmt; *c!=0; *c=toupper(*c), c++); if (strcmp(fmt, LM_TXT_TEXT)==0) saveFmt = LMF_TEXT; else if (strcmp(fmt, LM_TXT_BINARY)==0) saveFmt = LMF_BINARY; else if (strcmp(fmt, LM_TXT_ULTRA)==0) saveFmt = LMF_ULTRA; else HError(16919,"Unrecognised LM format, should be one of [%s, %s, %s]", LM_TXT_TEXT, LM_TXT_BINARY, LM_TXT_ULTRA); break; case 'm': remDup=FALSE; break; case 'n': nSize = GetChkedInt(1,LM_NSIZE,s); break; case 'o': firstOnly=TRUE; break; case 'u': if (NextArg()!=STRINGARG) HError(16919,"LMCopy: Unigram file name expected"); uniFn = GetStrArg(); break; case 'v': if (NextArg()!=STRINGARG) HError(16919,"LMCopy: Dictionary output file name expected"); outDictFn = GetStrArg(); break; case 'w': if (NextArg() != STRINGARG) HError(16919,"LPlex: Word list file name expected"); wlistFN = GetStrArg(); break; case 'T': trace = GetChkedInt(0,077, s); break; default: HError(16919,"LMPlex: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) /* load the language model */ HError(16919, 
"Input language model filename expected"); srcFN = GetStrArg(); if (NextArg()!=STRINGARG) /* load the language model */ HError(16919, "Output language model filename expected"); tgtFN= GetStrArg(); if (wlistFN!=NULL) { InitVocab(&vocab); if(ReadDict(wlistFN,&vocab) < SUCCESS) HError(16913,"Could not read dict in %s", wlistFN); if (trace&T_TOP) { printf("Loaded %d words from %s\n",vocab.nwords,wlistFN); fflush(stdout); } voc = &vocab; CreateWordList(wlistFN,&wlist,10); lm = LoadLangModel(srcFN,&wlist,1.0,LMP_FLOAT|LMP_COUNT,&langHeap); } else { voc = NULL; lm = LoadLangModel(srcFN,NULL,1.0,LMP_FLOAT|LMP_COUNT,&langHeap); } if (trace&T_TOP) { printf("Loaded model from %s\n",srcFN); fflush(stdout); } if (lm->probType==LMP_COUNT) { RebuildLM(lm, cutOff, NULL, LMP_FLOAT); /* GLM there was no threshold before! */ } if (uniFn!=NULL) ReplaceUnigrams(uniFn,lm); if (nSize>0 && nSize<lm->nSize) lm->nSize = nSize; #ifdef HTK_CRYPT if (lm->encrypt && saveFmt==LMF_TEXT) saveFmt = LMF_BINARY; #endif for (i=1;i<=lm->nSize;i++) lm->gInfo[i].fmt = (i==1) ? LMF_TEXT : saveFmt; SaveLangModel(tgtFN,lm); if (trace&T_TOP) { printf("Wrote model to %s\n",tgtFN); fflush(stdout); } if (outDictFn) { MakeDictionary(outDictFn,dList,voc); } Exit(EXIT_SUCCESS); return EXIT_SUCCESS; /* never reached -- make compiler happy */ }
/* Initialise: perform global initialisations */ static void Initialise(void) { int i,j,ndx; float x; LMInfo *li; Boolean inLM; LabId *wid,lab; NameId *na,nid; Boolean isPipe; nulClass = GetLabId(nulName,TRUE); /* normalise weights */ for (x=0.0, i=1; i<nLModel; i++) x += lmInfo[i].weight; lmInfo[0].weight = 1.0-x; /* load all models */ for (li=lmInfo, i=0; i<nLModel; i++, li++) { if (trace&T_TOP) printf("Loading language model from %s\n",li->fn); li->lm = LoadLangModel(li->fn,NULL,1.0,LMP_LOG|LMP_COUNT,&permHeap); if (li->lm->probType==LMP_COUNT) RebuildLM(li->lm,cutOff,wdThresh,LMP_LOG); AttachAccessInfo(li->lm); } if (trace&T_TOP) { printf("Using language model(s): \n"); for (li=lmInfo,i=0; i<nLModel; i++,li++) printf(" %d-gram %s, weight %.2f\n",li->lm->nSize,li->fn,li->weight); } if (numTests==0) { numTests=1; testInfo[0] = lmInfo[0].lm->nSize; } /* load or create word list */ if (wlistFN!=NULL) { /* load word list from file */ CreateWordList(wlistFN,&wList,nWords+10); nWords = wList.used; for (wid=wList.id, i=0; i<nWords; i++,wid++) /* assign lookup indices */ (*wid)->aux = (Ptr) (i+1); } else { /* derive word list from LMs */ for (nWords=0,li=lmInfo, i=0; i<nLModel; i++, li++) { /* Obtain class-LM word list in a different way */ if (li->lm->classLM) { na = li->lm->classBM; for (j=0; j<li->lm->classW; j++) { lab = GetLabId(na[j+1]->name, TRUE); if (lab->aux==NULL) lab->aux = (Ptr) (++nWords); } } else { na = li->lm->binMap; for (j=0; j<li->lm->vocSize; j++) { lab = GetLabId(na[j+1]->name,TRUE); if (lab->aux==NULL) lab->aux = (Ptr) (++nWords); } } } CreateWordList(NULL,&wList,nWords+10); for (li=lmInfo, i=0; i<nLModel; i++, li++) { /* Obtain class-LM word list in a different way */ if (li->lm->classLM) { na = li->lm->classBM; for (j=0; j<li->lm->classW; j++) { lab = GetLabId(na[j+1]->name,TRUE); ndx = ((int) lab->aux) - 1; wList.id[ndx] = lab; } } else { na = li->lm->binMap; for (j=0; j<li->lm->vocSize; j++) { lab = GetLabId(na[j+1]->name,TRUE); ndx = ((int) 
lab->aux) - 1; wList.id[ndx] = lab; } } } wList.used = nWords; } if (trace&T_TOP) { printf("Found %d unique words in %d model(s)\n",nWords,nLModel); fflush(stdout); } if (unkId->aux==NULL && !skipOOV) { HError(16620,"LPlex: OOV class symbol %s not in word list",unkId->name); } if (sstId->aux==NULL) { HError(16620,"LPlex: sentence start symbol %s not in word list",sstId->name); } if (senId->aux==NULL) { HError(16620,"LPlex: sentence end symbol %s not in word list",senId->name); } /* create lookup table */ l2nId = (NameId **) New(&permHeap,nLModel*sizeof(NameId *)); /* create LabId -> NameId lookup arrays (one per LM) */ for (li=lmInfo, i=0; i<nLModel; i++, li++, na++) { na = (NameId *) New(&permHeap,(nWords+2)*sizeof(NameId)); for (wid = wList.id, j=0; j<nWords; j++, wid++) { if (li->lm->classLM) { nid = na[(int) ((*wid)->aux)] = GetNameId(li->lm->classH, (*wid)->name, FALSE); } else { nid = na[(int) ((*wid)->aux)] = GetNameId(li->lm->htab, (*wid)->name, FALSE); } #ifdef SANITY if (nid==NULL) HError(-16625,"Unable to find word %s in model %s\n",(*wid)->name,li->fn); #endif } l2nId[i] = na; } /* ensure words present at least in one model */ for (wid = wList.id, j=0; j<nWords; j++, wid++) { for (inLM=FALSE,i=0; i<nLModel; i++, li++) if (l2nId[i][(int) ((*wid)->aux)]!=NULL) inLM = TRUE; if (!inLM) HError(16625,"Unable to find word %s in any model\n",(*wid)->name); } /* create equivalence class lookup array */ eqId = (LabId *) New(&permHeap,(nWords+NumEquiv()+2)*sizeof(NameId)); for (wid = wList.id, i=0; i<nWords; i++, wid++) { eqId[(int) ((*wid)->aux)] = NULL; } /* link equivalence classes */ LinkEquiv(); /* open output stream */ if (outStreamFN != NULL) if ((outStream = FOpen(outStreamFN,NoOFilter,&isPipe)) == NULL) HError(16610,"Initialise: unable to open output file %s",outStreamFN); }
int main(int argc, char *argv[]) { int i; char *s,*c; char fmt[256]; InitShell(argc,argv,lnorm_version,lnorm_vc_id); InitMem(); InitMath(); InitWave(); InitLabel(); InitWMap(); InitLUtil(); InitLModel(); InitPCalc(); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(EXIT_SUCCESS); SetConfParms(); CreateHeap(&langHeap,"langHeap",MSTAK,1,0.5,5000,40000); for (i=1; i<=LM_NSIZE; i++) cutOff[i] = 0, wdThresh[i] = 0.0; while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(16519,"Bad switch %s; must be single letter",s); switch(s[0]){ case 'c': i = GetChkedInt(2,LM_NSIZE,s); cutOff[i] = GetChkedInt(1,1000,s); break; case 'd': i = GetChkedInt(2,LM_NSIZE,s); wdThresh[i] = GetChkedFlt(0.0,1E10,s); break; case 'f': strcpy(fmt,GetStrArg()); for (c=fmt; *c!=0; *c=toupper(*c),c++); if (strcmp(fmt, LM_TXT_TEXT)==0) saveFmt = LMF_TEXT; else if (strcmp(fmt, LM_TXT_BINARY)==0) saveFmt = LMF_BINARY; else if (strcmp(fmt, LM_TXT_ULTRA)==0) saveFmt = LMF_ULTRA; else HError(16519,"Unrecognised LM format, should be one of [%s, %s, %s]", LM_TXT_TEXT, LM_TXT_BINARY, LM_TXT_ULTRA); break; case 'n': nSize = GetChkedInt(1,LM_NSIZE,s); break; case 'w': if (NextArg() != STRINGARG) HError(16519,"LPlex: Word list file name expected"); wlistFN = GetStrArg(); break; case 'T': trace = GetChkedInt(0,077, s); break; default: HError(16519,"LMPlex: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) /* load the language model */ HError(16519, "Input language model filename expected"); srcFN = GetStrArg(); if (NextArg()!=STRINGARG) /* load the language model */ HError(16519, "Output language model filename expected"); tgtFN= GetStrArg(); if (wlistFN!=NULL) { CreateWordList(wlistFN,&wlist,10); lm = LoadLangModel(srcFN,&wlist,1.0,LMP_FLOAT|LMP_COUNT,&langHeap); } else { lm = LoadLangModel(srcFN,NULL,1.0,LMP_FLOAT|LMP_COUNT,&langHeap); } if (lm->probType==LMP_COUNT) { RebuildLM(lm,cutOff,wdThresh,LMP_FLOAT); } else { NormaliseLM(lm); } if (nSize>0 && 
nSize<lm->nSize) lm->nSize = nSize; for (i=1;i<=lm->nSize;i++) lm->gInfo[i].fmt = (i==1) ? LMF_TEXT : saveFmt; SaveLangModel(tgtFN,lm); Exit(EXIT_SUCCESS); return EXIT_SUCCESS; /* never reached -- make compiler happy */ }