/* ReadBoNGram: read and store a WSJ/DP (ARPA-style) format n-gram LM.
   Parses the "\data\" header for per-order counts, where each line is
   "ngram N=count" (text storage) or "ngram N~count" (binary storage),
   then reads each "\N-grams:" section into a newly created NGramLM. */
static void ReadBoNGram(LModel *lm,char *fn)
{
   NGramLM *nglm;
   int i,j,k,counts[NSIZE+1];
   Boolean ngBin[NSIZE+1];
   char buf[MAXSTRLEN+1],syc[64];
   char ngFmtCh;

   if (trace&T_TIO)
      printf("\nBOffB "),fflush(stdout);
   if(InitSource(fn,&source,LangModFilter)<SUCCESS)
      HError(8110,"ReadBoNGram: Can't open file %s", fn);
   GetInLine(buf);
   SyncStr(buf,"\\data\\");

   /* Initialise both arrays: counts so absent orders read as 0, and
      ngBin so no element is ever read while indeterminate */
   for (i=1;i<=NSIZE;i++) {
      counts[i]=0; ngBin[i]=FALSE;
   }
   for (i=1;i<=NSIZE;i++) {
      GetInLine(buf);
      /* Zero these before scanning: if sscanf matches fewer than 3 items
         the checks below would otherwise read indeterminate values */
      j=0; k=0; ngFmtCh='\0';
      if (sscanf(buf, "ngram %d%c%d", &j, &ngFmtCh, &k)!=3 && i>1)
         break;                /* end of "ngram" header lines */
      if (i!=j || k==0)
         HError(8150,"ReadBoNGram: %dGram count missing (%s)",i,buf);
      switch (ngFmtCh) {
      case '=': ngBin[j] = FALSE; break;   /* text-format section */
      case '~': ngBin[j] = TRUE;  break;   /* binary-format section */
      default:
         HError (9999, "ReadBoNGram: unknown ngram format type '%c'", ngFmtCh);
      }
      counts[j]=k;
   }
   if (ngBin[1])
      HError (8113, "ReadBoNGram: unigram must be stored as text");

   nglm=CreateBoNGram(lm,counts[1],counts);
   /* Read each n-gram section in ascending order */
   for (i=1;i<=nglm->nsize;i++) {
      sprintf(syc,"\\%d-grams:",i);
      SyncStr(buf,syc);
      ReadNGrams(nglm,i,nglm->counts[i], ngBin[i]);
   }
   SyncStr(buf,"\\end\\");
   CloseSource(&source);

   if (trace&T_TIO) {
      printf("\n NEntry==%d ",nglm->counts[0]);
      for(i=1;i<=nglm->nsize;i++)
         printf(" %d-Grams==%d",i,nglm->counts[i]);
      printf("\n\n");
      fflush(stdout);
   }
}
/* OutputBoBigram: output ARPA/MIL-LL style back off bigram */ void OutputBoBigram(void) { LModel lm; NGramLM *nglm; NEntry *ne; SEntry *se; AEntry **aelists; lmId ndx[NSIZE]; int i,tot,counts[NSIZE+1]; double uent,ent,bent; lm.heap=&statHeap; lm.type=boNGram; counts[1]=lSize;counts[2]=nae; for(i=3;i<NSIZE+1;i++) counts[i]=0; nglm=CreateBoNGram(&lm,lSize,counts); /* Give max size at creation */ for (i=1;i<=lSize;i++) nglm->wdlist[i]=lTab[i].name; aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1)); for (i=1;i<=lSize;i++) aelists[i]=NULL; RebuildAETab(aelists); /* Un-hash hashtable */ for (i=1,tot=0.0;i<=lSize;i++) { /* Calculate unigrams first */ if (i==(int)enterId->aux) nglm->unigrams[i]=0.0; else if (lTab[i].count<uniFloor) nglm->unigrams[i]=uniFloor; else nglm->unigrams[i]=lTab[i].count; tot+=nglm->unigrams[i]; } for (i=1,uent=0.0;i<=lSize;i++,se++) { nglm->unigrams[i]=nglm->unigrams[i]/tot; uent-=ent2(nglm->unigrams[i]); } nglm->counts[1]=lSize; /* Calculate real sizes during build */ nglm->counts[2]=0; for (i=0; i<NSIZE; i++) ndx[i]=0; if (trace&T_BIG) { printf("\n UNIGRAM NEntry - %4d foll, ent %.3f [= %.3f]\n\n", lSize,uent,pow(2.0,uent)); printf(" BIGRAMS NEntries\n"); fflush(stdout); } for (i=1,bent=0.0;i<=lSize;i++) { ndx[0]=i; ne=GetNEntry(nglm,ndx,TRUE); ne->user=aelists[i]; ent = BuildNEntry(ne,nglm->unigrams,uent); nglm->counts[2]+=ne->nse; if (trace&T_BIG) if (i!=(int)exitId->aux){ if (i==(int)enterId->aux) bent+=nglm->unigrams[(int)exitId->aux]*ent; else bent+=nglm->unigrams[i]*ent; printf(" %-20s - %4d foll, ent %6.3f [= %6.2f]\n", lTab[i].name->name,ne->nse,ent,pow(2.0,ent)); fflush(stdout); } } Dispose(&tmpHeap,aelists); if (trace&T_BIG) { printf("\n BIGRAM: training data entropy %.3f (perplexity %.2f)\n", bent,pow(2.0,bent)); fflush(stdout); } ndx[0]=0; /* Set up unigram nentry separately */ ne=GetNEntry(nglm,ndx,TRUE); ne->nse=lSize; se=ne->se=(SEntry*)New(nglm->heap,sizeof(SEntry)*lSize); for (i=1;i<=lSize;i++,se++) { se->word=i; if 
(nglm->unigrams[i]>0) se->prob=nglm->unigrams[i]=log(nglm->unigrams[i]); else se->prob=nglm->unigrams[i]=LZERO; } lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */ WriteLModel(&lm,bigFile,0); }