/* EXPORT GetLMProb: return probability of word wd_id following pr_id[] */
/*
   lm    - language model; handles back-off n-gram (boNGram) and matrix
           bigram (matBigram) types
   prid  - history words; NULL entries mark unused slots.  Index s of the
           last non-NULL entry is treated as the oldest context word and is
           the one discarded on back-off (presumably prid[0] is the most
           recent word -- TODO confirm against callers)
   wdid  - word to predict; wdid->aux holds its 1-based LM word index
   returns log probability; LZERO if the word is outside the LM vocabulary
*/
float GetLMProb(LModel *lm, LabId prid[NSIZE], LabId wdid)
{
   LabId cpid[NSIZE];
   NEntry *ne;
   SEntry *se;
   lmId p, q, word, ndx[NSIZE];
   LogFloat bowt,prob;
   int i, s;

   switch (lm->type) {
   case boNGram:
      word = (int)wdid->aux;
      /* reject out-of-vocabulary words immediately */
      if (word==0 || word>lm->data.ngram->vocSize)
         return(LZERO);
      /* copy history into ndx (word indices) and cpid (LabIds);
         s ends up as the index of the last non-NULL history word */
      for (s=-1,i=0;i<NSIZE;i++)
         if (prid[i]!=NULL)
            ndx[i]=(int)prid[i]->aux, cpid[i]=prid[i], s=i;
         else
            ndx[i]=0, cpid[i]=NULL;
      /* If no answer back-off to unigram */
      if (s<0) {
         /* NOTE(review): word!=0 is always true here (checked above), so the
            else branch appears unreachable */
         if (word!=0)
            return(lm->data.ngram->unigrams[word]);
         else
            return(log(1.0/lm->data.ngram->vocSize));
      }
      /* truncate context for the recursive back-off call below */
      cpid[s]=0;
      ne = GetNEntry(lm->data.ngram,ndx,FALSE);
      if (ne) {
         /* Replace with bsearch equivalent */
         /* linear scan of the SEntry array for this history */
         for (i=0, se=ne->se; i<ne->nse; i++,se++)
            if (se->word==word)
               return(se->prob); /* Ngram found */
         bowt=ne->bowt;
      } else {
         /* history never seen: back off with zero back-off weight */
         bowt=0.0;
      }
      if (s==0)
         return(lm->data.ngram->unigrams[word]+bowt); /* Backoff to unigram */
      else
         return(bowt+GetLMProb(lm,cpid,wdid)); /* else recurse */
      break;
   case matBigram:
      p=(int) prid[0]->aux;
      q=(int) wdid->aux;
      return(lm->data.matbi->bigMat[p][q]);
   default:
      prob=LZERO;
   }
   return(prob);
}
/* EXPORT->CreateBoNGram: Allocate and create basic NGram structures */
/*
   lm       - model shell whose heap is used for all allocation; on return
              lm->data.ngram points at the new structure
   vocSize  - number of words in the vocabulary (word ids are 1-based)
   counts   - counts[1..NSIZE] give expected n-gram counts per order; a zero
              entry terminates the list.  NOTE(review): the loops below read
              and write index NSIZE, so both this parameter and the
              nglm->counts field are assumed to hold NSIZE+1 ints -- confirm
              against the struct/caller declarations
   returns the newly created NGramLM
*/
NGramLM *CreateBoNGram(LModel *lm,int vocSize, int counts[NSIZE])
{
   lmId ndx[NSIZE];
   int i,k;
   NGramLM *nglm;

   nglm = (NGramLM *) New(lm->heap, sizeof(NGramLM));
   lm->data.ngram = nglm;
   nglm->heap = lm->heap;

   /* clear then copy the per-order counts; stop at the first zero */
   for (i=0;i<=NSIZE;i++)
      nglm->counts[i]=0;
   for (i=1;i<=NSIZE;i++)
      if (counts[i]==0) break;
      else nglm->counts[i]=counts[i];
   nglm->nsize=i-1;   /* Don't count final layer */

   /* total expected contexts, excluding the highest order */
   for (k=0,i=1;i<nglm->nsize;i++)
      k+=nglm->counts[i];
   /* Then use total to guess NEntry hash size */
   if (k<25000)
      nglm->hashsize=NGHSIZE1;
   else if (k<250000)
      nglm->hashsize=NGHSIZE2;
   else
      nglm->hashsize=NGHSIZE3;
   nglm->hashtab=(NEntry **) New(lm->heap,sizeof(NEntry*)*nglm->hashsize);
   for (i=0; i<nglm->hashsize; i++)
      nglm->hashtab[i]=NULL;

   nglm->vocSize = vocSize;
   nglm->unigrams = CreateVector(lm->heap,nglm->vocSize);
   nglm->wdlist = (LabId *) New(lm->heap,nglm->vocSize*sizeof(LabId));
   /* shift pointer so wdlist is indexed 1..vocSize (HTK 1-based convention,
      matching CreateVector); wdlist[0] must never be touched */
   nglm->wdlist--;
   for (i=1;i<=nglm->vocSize;i++)
      nglm->wdlist[i]=NULL;

   /* pre-create the all-zero (unigram) context entry */
   for (i=0;i<NSIZE;i++) ndx[i]=0;
   GetNEntry(nglm,ndx,TRUE);

   return(nglm);
}
/* OutputBoBigram: output ARPA/MIL-LL style back off bigram */ void OutputBoBigram(void) { LModel lm; NGramLM *nglm; NEntry *ne; SEntry *se; AEntry **aelists; lmId ndx[NSIZE]; int i,tot,counts[NSIZE+1]; double uent,ent,bent; lm.heap=&statHeap; lm.type=boNGram; counts[1]=lSize;counts[2]=nae; for(i=3;i<NSIZE+1;i++) counts[i]=0; nglm=CreateBoNGram(&lm,lSize,counts); /* Give max size at creation */ for (i=1;i<=lSize;i++) nglm->wdlist[i]=lTab[i].name; aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1)); for (i=1;i<=lSize;i++) aelists[i]=NULL; RebuildAETab(aelists); /* Un-hash hashtable */ for (i=1,tot=0.0;i<=lSize;i++) { /* Calculate unigrams first */ if (i==(int)enterId->aux) nglm->unigrams[i]=0.0; else if (lTab[i].count<uniFloor) nglm->unigrams[i]=uniFloor; else nglm->unigrams[i]=lTab[i].count; tot+=nglm->unigrams[i]; } for (i=1,uent=0.0;i<=lSize;i++,se++) { nglm->unigrams[i]=nglm->unigrams[i]/tot; uent-=ent2(nglm->unigrams[i]); } nglm->counts[1]=lSize; /* Calculate real sizes during build */ nglm->counts[2]=0; for (i=0; i<NSIZE; i++) ndx[i]=0; if (trace&T_BIG) { printf("\n UNIGRAM NEntry - %4d foll, ent %.3f [= %.3f]\n\n", lSize,uent,pow(2.0,uent)); printf(" BIGRAMS NEntries\n"); fflush(stdout); } for (i=1,bent=0.0;i<=lSize;i++) { ndx[0]=i; ne=GetNEntry(nglm,ndx,TRUE); ne->user=aelists[i]; ent = BuildNEntry(ne,nglm->unigrams,uent); nglm->counts[2]+=ne->nse; if (trace&T_BIG) if (i!=(int)exitId->aux){ if (i==(int)enterId->aux) bent+=nglm->unigrams[(int)exitId->aux]*ent; else bent+=nglm->unigrams[i]*ent; printf(" %-20s - %4d foll, ent %6.3f [= %6.2f]\n", lTab[i].name->name,ne->nse,ent,pow(2.0,ent)); fflush(stdout); } } Dispose(&tmpHeap,aelists); if (trace&T_BIG) { printf("\n BIGRAM: training data entropy %.3f (perplexity %.2f)\n", bent,pow(2.0,bent)); fflush(stdout); } ndx[0]=0; /* Set up unigram nentry separately */ ne=GetNEntry(nglm,ndx,TRUE); ne->nse=lSize; se=ne->se=(SEntry*)New(nglm->heap,sizeof(SEntry)*lSize); for (i=1;i<=lSize;i++,se++) { se->word=i; if 
(nglm->unigrams[i]>0) se->prob=nglm->unigrams[i]=log(nglm->unigrams[i]); else se->prob=nglm->unigrams[i]=LZERO; } lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */ WriteLModel(&lm,bigFile,0); }
HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = wordPair; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Word pair grammar file name expected"); ipFn = GetStrArg(); break; case 'x': if (bType != unknown) HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = multiLat; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Multi-level lattice file name expected"); ipFn = GetStrArg(); break; case 'z': zapUnknown = TRUE; break; case 'T': trace = GetChkedInt(0,511,s); break; default: HError(3019,"HBuild: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) HError(3019,"HBuild: Word List file name expected"); wordListFn = GetStrArg(); if (NextArg()!=STRINGARG) HError(3019,"HBuild: output lattice file name expected"); latFn = GetStrArg(); if (bType == unknown) bType = wordLoop; if (saveLatBin) format |= HLAT_LBIN; /* Read the word-list into a Vocab data structure */ InitVocab(&voc); if(ReadDict(wordListFn, &voc)<SUCCESS) HError(3013,"HBuild: ReadDict failed"); switch (bType) { case matBiGram: if (trace & T_TOP) printf("Reading bigram from file %s\n",ipFn); bigramLm = ReadLModel(&gstack, ipFn); if (bigramLm->type != matBigram) HError(3030,"HBuild: File specified is not a matrix bigram"); lat = ProcessBiGram(&gstack,&voc,bigramLm); SaveLattice(lat,latFn,format); break; case boBiGram: if (trace & T_TOP)
/* LMTransProb_ngram
   return logprob of transition from src labelled word. Also return dest
   state. ngram case */
/*
   lm    - back-off n-gram model (asserted)
   src   - current LM state: an NEntry for the history, or NULL for the
           empty (unigram) history
   wdid  - word being consumed; wdid->aux holds its 1-based LM word index
   dest  - out: new LM state after consuming wdid (NULL if no matching
           history NEntry exists at any back-off level)
   returns log prob p(wdid | src), backing off through shorter histories
   and accumulating back-off weights as needed; LZERO for OOV words.

   Fix relative to the previous revision: in the "backed off all the way to
   unigram" branch, ne may be NULL (GetNEntry can fail for sparse/odd LMs,
   which is exactly why the back-off loop guards its own ne->bowt access),
   so the final back-off weight is now only added when ne is non-NULL
   instead of dereferencing a possibly-NULL pointer.
*/
LogFloat LMTrans (LModel *lm, LMState src, LabId wdid, LMState *dest)
{
   NGramLM *nglm;
   LogFloat lmprob;
   lmId hist[NSIZE] = {0};      /* initialise whole array to zero! */
   int i, l;
   NEntry *ne;
   SEntry *se;
   lmId word;

   assert (lm->type == boNGram);
   nglm = lm->data.ngram;

   word = (int) wdid->aux;
   if (word==0 || word>lm->data.ngram->vocSize) {
      HError (-9999, "word %d not in LM wordlist", word);
      *dest = NULL;
      return (LZERO);
   }

   ne = src;

   if (!src) {          /* unigram case */
      lmprob = nglm->unigrams[word];
   }
   else {
      /* lookup prob p(word | src) */
      /* try to find pronid in SEntry array */
      se = FindSEntry (ne->se, word, 0, ne->nse - 1);
      assert (!se || (se->word == word));
      if (se)     /* found */
         lmprob = se->prob;
      else {      /* not found: back off through shorter histories */
         lmprob = 0.0;
         l = 0;
         hist[NSIZE-1] = 0;
         for (i = 0; i < NSIZE-1; ++i) {
            hist[i] = ne->word[i];
            if (hist[i] != 0)
               l = i;
         } /* l is now the index of the last (oldest) non zero element */
         for ( ; l > 0; --l) {
            if (ne)
               lmprob += ne->bowt;
            hist[l] = 0;   /* back-off: discard oldest word */
            ne = GetNEntry (nglm, hist, FALSE);
            if (ne) {   /* skip over non existing hists. fix for weird LMs */
               /* try to find pronid in SEntry array */
               se = FindSEntry (ne->se, word, 0, ne->nse - 1);
               assert (!se || (se->word == word));
               if (se) { /* found it */
                  lmprob += se->prob;
                  l = -1;
                  break;
               }
            }
         }
         if (l == 0) {   /* backed-off all the way to unigram */
            assert (!se);
            if (ne)      /* fixed: ne may be NULL here; do not dereference */
               lmprob += ne->bowt;
            lmprob += nglm->unigrams[word];
         }
      }
   }

   /* now determine dest state: shift the history right by one slot and
      install word as the most recent context word */
   if (src) {
      ne = (NEntry *) src;
      l = 0;
      hist[NSIZE-1] = 0;
      for (i = 1; i < NSIZE-1; ++i) {
         hist[i] = ne->word[i-1];
         if (hist[i] != 0)
            l = i;
      } /* l is now the index of the last (oldest) non zero element */
   }
   else {
      for (i = 1; i < NSIZE-1; ++i)
         hist[i] = 0;
      l = 1;
   }
   hist[0] = word;

   /* find the longest-history NEntry that actually exists */
   ne = (LMState) GetNEntry (nglm, hist, FALSE);
   for ( ; !ne && (l > 0); --l) {
      hist[l] = 0;   /* back off */
      ne = (LMState) GetNEntry (nglm, hist, FALSE);
   }
   /* if we left the loop because l=0, then ne is still NULL,
      which is what we want */
   *dest = ne;

#if 0
   printf ("lmprob = %f dest %p\n", lmprob, *dest);
#endif

   return (lmprob);
}
/* ReadNGrams: read n grams list from file */
/*
   nglm  - model being populated; unigrams/wdlist filled when n==1
   n     - n-gram order of the section being read (1 = unigrams)
   count - number of entries expected in this section; a single SEntry
           block of this size is carved up between the history NEntries,
           which therefore must arrive grouped by history (checked below)
   bin   - TRUE for the binary ARPA variant (per-entry size/flag bytes,
           word ids as shorts or ints); FALSE for text ARPA
   returns the number of n-grams actually read.
   Reads from the file-scope `source` stream; probabilities and back-off
   weights are converted from log10 to natural log via LN10.
*/
static int ReadNGrams(NGramLM *nglm,int n,int count, Boolean bin)
{
   float prob;
   LabId wdid;
   SEntry *cse;      /* next free slot in the shared SEntry block */
   char wd[255];
   lmId ndx[NSIZE+1];
   NEntry *ne,*le=NULL;   /* current and previous history entries */
   int i, g, idx, total;
   unsigned char size, flags=0;   /* NOTE(review): size is read but unused */

   cse = (SEntry *) New(nglm->heap,count*sizeof(SEntry));
   for (i=1;i<=NSIZE;i++) ndx[i]=0;

   if (trace&T_TIO)
      printf("\nn%1d ",n),fflush(stdout);
   total=0;
   for (g=1; g<=count; g++){
      PROGRESS(g);

      if (bin) {
         size = GetCh (&source);
         flags = GetCh (&source);
      }

      prob = GetFloat(bin)*LN10;

      if (n==1) { /* unigram treated as special */
         ReadLMWord(wd);
         wdid = GetLabId(wd, TRUE);
         if (wdid->aux != NULL)
            HError(8150,"ReadNGrams: Duplicate word (%s) in 1-gram list",
                   wdid->name);
         wdid->aux = (Ptr)g;   /* word id is its position in the list */
         nglm->wdlist[g] = wdid;
         nglm->unigrams[g] = prob;
         ndx[0]=g;
      } else {    /* bigram, trigram, etc. */
         /* read the n words; ndx[0] = predicted word,
            ndx[1..n-1] = history (most recent first) */
         for (i=0;i<n;i++) {
            if (bin) {
               if (flags & BIN_ARPA_INT_LMID) {
                  unsigned int ui;
                  if (!ReadInt (&source, (int *) &ui, 1, bin))
                     HError (9999, "ReadNGrams: failed reading int lm word id");
                  idx = ui;
               } else {
                  unsigned short us;
                  if (!ReadShort (&source, (short *) &us, 1, bin))
                     HError (9999, "ReadNGrams: failed reading short lm word id at");
                  idx = us;
               }
            } else {
               ReadLMWord(wd);
               wdid = GetLabId(wd, FALSE);
               idx = (wdid==NULL?0:(int)wdid->aux);
            }
            /* NOTE(review): in binary mode wd is not filled in, so this
               message can print stale/garbage text -- confirm/fix upstream */
            if (idx<1 || idx>nglm->vocSize)
               HError(8150,"ReadNGrams: Unseen word (%s) in %dGram",wd,n);
            ndx[n-1-i]=idx;
         }
      }
      total++;
      /* history (ndx+1) must already exist from the (n-1)-gram pass */
      ne = GetNEntry(nglm,ndx+1,FALSE);
      if (ne == NULL)
         HError(8150,"ReadNGrams: Backoff weight not seen for %dth %dGram",g,n);
      if (ne!=le) {
         /* new history: close out the previous one (sort its SEntries)
            and point this one at the next region of the shared block */
         if (le != NULL && ne->se != NULL)
            HError(8150,"ReadNGrams: %dth %dGrams out of order",g,n);
         if (le != NULL) {
            if (le->nse==0) {
               le->se=NULL;
            } else {
               qsort(le->se,le->nse,sizeof(SEntry),se_cmp);
            }
         }
         ne->se = cse;
         ne->nse = 0;
         le = ne;
      }
      cse->prob = prob;
      cse->word = ndx[0];
      ne->nse++;
      cse++;

      /* read back-off weight */
      if (bin) {
         if (flags & BIN_ARPA_HAS_BOWT) {
            ne = GetNEntry(nglm,ndx,TRUE);
            ne->bowt = GetFloat (TRUE)*LN10;
         }
      } else {
         /* text mode: a weight is present iff more tokens follow on the line */
         SkipWhiteSpace(&source);
         if (!source.wasNewline) {
            ne=GetNEntry(nglm,ndx,TRUE);
            ne->bowt = GetFloat(FALSE)*LN10;
         }
      }
   }
   /* deal with the last accumulated set */
   if (le != NULL) {
      if (le->nse==0) {
         le->se=NULL;
      } else {
         qsort(le->se,le->nse,sizeof(SEntry),se_cmp);
      }
   }
   if (trace&T_TIO)
      printf("\n"),fflush(stdout);
   return(total);
}
/* WriteNGram: Write n grams to file */
/*
   file  - open text stream positioned for ARPA-format output
   nglm  - model to dump
   n     - n-gram order of the section to write
   scale - multiplier applied to every log prob / back-off weight on output
   returns the number of n-grams written.
   Collects every NEntry whose history is exactly n-1 words long, sorts
   them (nep_cmp) for deterministic ordering, then writes one line per
   SEntry: scaled prob, the n words oldest-first, and -- if the resulting
   n-gram is itself a history with successors -- its back-off weight.
*/
static int WriteNGrams(FILE *file,NGramLM *nglm,int n,float scale)
{
   NEntry *ne,*be,*ce,**neTab;
   SEntry *se;
   LogFloat prob;
   lmId ndx[NSIZE+1];
   int c,i,j,k,N,g=1,hash,neCnt,total;

   if (trace&T_TIO)
      printf("\nn%1d ",n),fflush(stdout);
   fprintf(file,"\n\\%d-grams:\n",n);
   N=VectorSize(nglm->unigrams);   /* NOTE(review): N is never used below */
   /* counts[0] is assumed to hold the total NEntry count -- presumably
      maintained by GetNEntry; verify before changing allocation size */
   neTab=(NEntry **) New(&gstack,sizeof(NEntry*)*nglm->counts[0]);

   /* walk the hash table keeping only entries whose history length is
      exactly n-1: first n-1 context words non-zero, the rest zero */
   for (hash=neCnt=0;hash<nglm->hashsize;hash++)
      for (ne=nglm->hashtab[hash]; ne!=NULL; ne=ne->link) {
         for (i=1,ce=ne;i<n;i++)
            if (ne->word[i-1]==0) {
               ce=NULL;
               break;
            }
         if (ce!=NULL)
            for (i=n;i<NSIZE;i++)
               if (ne->word[i-1]!=0) {
                  ce=NULL;
                  break;
               }
         if (ce!=NULL && ce->nse>0)
            neTab[neCnt++]=ce;
      }
   qsort(neTab,neCnt,sizeof(NEntry*),nep_cmp);

   total=0;
   for (c=n;c<=NSIZE;c++) ndx[c]=0;
   for (j=0;j<neCnt;j++) {
      ne=neTab[j];
      for (c=1;c<n;c++) ndx[c]=ne->word[c-1];
      if (ne!=NULL && ne->nse>0) {
         for (i=0,se=ne->se;i<ne->nse;i++,se++) {
            /* progress dots for large models */
            if (trace&T_TIO) {
               if ((g%25000)==0)
                  printf(". "),fflush(stdout);
               if ((g%800000)==0)
                  printf("\n "),fflush(stdout);
               g++;
            }
            ndx[0]=se->word;
            /* be = NEntry for this full n-gram used as a history; only
               then does the line carry a back-off weight */
            if (n<nglm->nsize)
               be=GetNEntry(nglm,ndx,FALSE);
            else
               be=NULL;
            if (be==NULL || be->nse==0)
               be=NULL;
            total++;
            if (n==1)
               prob=nglm->unigrams[se->word];
            else
               prob=se->prob;
            /* clamp to the conventional ARPA floor of -99.999 */
            if (prob*scale<-99.999)
               fprintf(file,"%+6.3f",-99.999);
            else
               fprintf(file,"%+6.4f",prob*scale);
            c='\t';
            /* words oldest-first; tab before the first, spaces between */
            for (k=n-1;k>=0;k--)
               if (rawMITFormat)
                  fprintf(file,"%c%s",c,nglm->wdlist[ndx[k]]->name),c=' ';
               else
                  fprintf(file,"%c%s",c,
                          ReWriteString(nglm->wdlist[ndx[k]]->name,
                                        NULL,ESCAPE_CHAR)),c=' ';
            if (be!=NULL)
               fprintf(file,"\t%+6.4f\n",be->bowt*scale);
            else
               fprintf(file,"\n");
         }
      }
   }
   Dispose(&gstack,neTab);

   if (trace&T_TIO)
      printf("\n"),fflush(stdout);
   return(total);
}