/* OutputMatBigram: output matrix style bigram */ void OutputMatBigram(void) { LModel lm; MatBiLM *matbi; AEntry **aelists,*ae; Vector vec; double vsum,fsum,tot,scale; double ent,bent,prob,fent; int i,j,nf,tf=0,nu,tu=0,np,tp=0,tn=0; lm.heap=&statHeap; lm.type=matBigram; matbi=CreateMatBigram(&lm,lSize); for (i=1;i<=lSize;i++) matbi->wdlist[i]=lTab[i].name; aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1)); for (i=1;i<=lSize;i++) aelists[i]=NULL; RebuildAETab(aelists); /* Un-hash hashtable */ if (trace&T_BIG) { printf("\n BIGRAMS from MatBigram\n"); fflush(stdout); } bent=0.0; fent = ent2(bigFloor); for (i=1;i<=lSize;i++) { vec=matbi->bigMat[i]; for (ae=aelists[i],tot=0.0; ae!=NULL; ae=ae->link) if (ae->word[0]!=0) tot += ae->count; fsum = (lSize-1)*bigFloor; vsum=0.0; for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count/tot > bigFloor && ae->word[0]!=0) fsum -= bigFloor, vsum += ae->count; else ae->count=0; scale = (1.0 - fsum) / vsum; for (j=1;j<=lSize;j++) { if (j==(int)enterId->aux) vec[j]=0.0; else if (tot==0.0) vec[j]=1.0/(lSize-1); else vec[j]=bigFloor; } for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count>0) vec[ae->word[0]]=ae->count*scale; if (trace&T_BIG) { nf=nu=np=0; if (tot==0.0) ent=-log2(1.0/(lSize-1)),prob=1.0,nu=lSize-1; else ent=-(lSize-1)*fent, prob=bigFloor*(lSize-1), nf+=lSize-1; for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count>0) { prob += vec[ae->word[0]]-bigFloor; ent -= ent2(vec[ae->word[0]]); ent += fent; nf--; np++; } if (i!=(int)exitId->aux){ j=lTab[i].count; bent+=j*ent;tn+=j; if (tot==0.0) printf(" %-20s - %4d unis, ent %6.3f [= %6.2f] (P=%7.5f)\n", lTab[i].name->name,nu,ent,pow(2.0,ent),prob); else printf(" %-20s - %4d foll, ent %6.3f [= %6.2f] (P=%7.5f)\n", lTab[i].name->name,np,ent,pow(2.0,ent),prob); fflush(stdout); } tf+=nf;tu+=nu;tp+=np; } } if (trace&T_BIG) { bent/=tn; printf("\n BIGRAM: training data entropy %.3f (perplexity %.2f)\n", bent,pow(2.0,bent)); printf(" Estimated %d, floored %d, unigrammed %d for %d\n", tp,tf,tu,lSize); fflush(stdout); } Dispose(&tmpHeap,aelists); /* convert probabilities to logs */ for (i=1;i<=matbi->numWords;i++) { vec = matbi->bigMat[i]; for (j=1; j<=matbi->numWords; j++){ vec[j] = ((vec[j]<MINLARG)?LZERO:log(vec[j])); } } lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */ WriteLModel(&lm,bigFile,0); }
/* ReadBigram: load a bigram from given file */ static void ReadMatBigram(LModel *lm,char *fn) { Vector vec; char buf[132]; int P,p,j; float sum,x; LabId id; MatBiLM *matbi; if (trace&T_TIO) printf("\nMB "),fflush(stdout); if(InitSource(fn,&source,LangModFilter)<SUCCESS) HError(8110,"ReadMatBigram: Can't open file %s", fn); vec = CreateVector(&gcheap,MAX_LMID); ReadLMWord(buf);SkipWhiteSpace(&source); id=GetLabId(buf,TRUE); P = ReadRow(vec); if (P<=0 || P >MAX_LMID) HError(8151,"ReadMatBigram: First row invalid (%d entries)",P); matbi=CreateMatBigram(lm,P); matbi->wdlist[1] = id; for (p=1;p<=P;p++) matbi->bigMat[1][p]=vec[p]; id->aux=(Ptr) 1; Dispose(&gcheap,vec); for (sum=0.0, j=1; j<=P; j++) { x = matbi->bigMat[1][j]; if (x<0) HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)", j,buf,x); sum += x; matbi->bigMat[1][j]=((x<MINLARG)?LZERO:log(x)); } if (sum < 0.99 || sum > 1.01) HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",1,fn,sum); for (p=2; ReadLMWord(buf); p++) { if (trace&T_TIO) { if ((p%25)==0) printf(". "),fflush(stdout); if ((p%800)==0) printf("\n "),fflush(stdout); } if (p>P) HError(8150,"ReadMatBigram: More rows than columns in bigram %s",fn); id=GetLabId(buf,TRUE); if ((int)id->aux != 0) HError(8150,"ReadMatBigram: Duplicated name %s in bigram %s",buf,fn); id->aux = (Ptr) p; matbi->wdlist[p] = id; SkipWhiteSpace(&source); if (ReadRow(matbi->bigMat[p])!=P) HError(8150,"ReadMatBigram: Wrong number of items in row %d",p); for (sum=0.0, j=1; j<=P; j++) { x = matbi->bigMat[p][j]; if (x<0) HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)", j,buf,x); sum += x; matbi->bigMat[p][j]=((x<MINLARG)?LZERO:log(x)); } if (sum < 0.99 || sum > 1.01) HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",p,fn,sum); } if (P>p) HError(8150,"ReadMatBigram: More columns than rows in bigram %s",fn); if (trace&T_TIO) printf("\n"),fflush(stdout); CloseSource(&source); }