Beispiel #1
0
/* OutputMatBigram: output matrix style bigram */
void OutputMatBigram(void)
{
   LModel lm;
   MatBiLM *matbi;
   AEntry **aelists,*ae;
   Vector vec;
   double vsum,fsum,tot,scale;
   double ent,bent,prob,fent;
   int i,j,nf,tf=0,nu,tu=0,np,tp=0,tn=0;

   lm.heap=&statHeap;
   lm.type=matBigram;
   matbi=CreateMatBigram(&lm,lSize);

   for (i=1;i<=lSize;i++)
      matbi->wdlist[i]=lTab[i].name;

   aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1));
   for (i=1;i<=lSize;i++) aelists[i]=NULL;
   RebuildAETab(aelists);          /* Un-hash hashtable */

   if (trace&T_BIG) {
      printf("\n  BIGRAMS from MatBigram\n");
      fflush(stdout);
   }
   bent=0.0;
   fent = ent2(bigFloor);
   for (i=1;i<=lSize;i++) {
      vec=matbi->bigMat[i];
      for (ae=aelists[i],tot=0.0; ae!=NULL; ae=ae->link)
         if (ae->word[0]!=0) tot += ae->count;
      fsum = (lSize-1)*bigFloor; vsum=0.0;
      for (ae=aelists[i];ae!=NULL;ae=ae->link)
         if (ae->count/tot > bigFloor && ae->word[0]!=0)
            fsum -= bigFloor, vsum += ae->count;
         else
            ae->count=0;
      scale = (1.0 - fsum) / vsum;
      for (j=1;j<=lSize;j++) {
         if (j==(int)enterId->aux) vec[j]=0.0;
         else if (tot==0.0) vec[j]=1.0/(lSize-1);
         else vec[j]=bigFloor;
      }
      for (ae=aelists[i];ae!=NULL;ae=ae->link)
         if (ae->count>0)
            vec[ae->word[0]]=ae->count*scale;
      if (trace&T_BIG) {
         nf=nu=np=0;
         if (tot==0.0) 
            ent=-log2(1.0/(lSize-1)),prob=1.0,nu=lSize-1;
         else
            ent=-(lSize-1)*fent,
               prob=bigFloor*(lSize-1),
               nf+=lSize-1;
         for (ae=aelists[i];ae!=NULL;ae=ae->link)
            if (ae->count>0) {
               prob += vec[ae->word[0]]-bigFloor;
               ent -= ent2(vec[ae->word[0]]);
               ent += fent;
               nf--;  np++;
            }
         if (i!=(int)exitId->aux){
            j=lTab[i].count;
            bent+=j*ent;tn+=j;
            if (tot==0.0)
               printf("   %-20s - %4d unis, ent %6.3f [= %6.2f] (P=%7.5f)\n",
                      lTab[i].name->name,nu,ent,pow(2.0,ent),prob);
            else
               printf("   %-20s - %4d foll, ent %6.3f [= %6.2f] (P=%7.5f)\n",
                      lTab[i].name->name,np,ent,pow(2.0,ent),prob);
            fflush(stdout);
         }
         tf+=nf;tu+=nu;tp+=np;
      }
   }
   if (trace&T_BIG) {
      bent/=tn;
      printf("\n  BIGRAM: training data entropy %.3f (perplexity %.2f)\n",
             bent,pow(2.0,bent));
      printf("         Estimated %d, floored %d, unigrammed %d for %d\n",
             tp,tf,tu,lSize);
      fflush(stdout);
   }

   Dispose(&tmpHeap,aelists);

   /* convert probabilities to logs */
   for (i=1;i<=matbi->numWords;i++) {
      vec = matbi->bigMat[i];
      for (j=1; j<=matbi->numWords; j++){
         vec[j] = ((vec[j]<MINLARG)?LZERO:log(vec[j]));
      }
   }
   lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */
   WriteLModel(&lm,bigFile,0);
}
Beispiel #2
0
/* ReadMatBigram: load a matrix bigram from the given file.
 *
 *   lm - language model that receives the matrix (via CreateMatBigram)
 *   fn - name of the file to read
 *
 * The first row is read into a temporary vector of MAX_LMID entries; the
 * number of entries found (P) fixes the matrix size.  Every following row
 * must also hold exactly P entries and there must be exactly P rows.
 * Each row's word is stored in wdlist[] and its LabId aux field is set to
 * the row index.  Entries are converted in place to natural logs, with
 * values below MINLARG stored as LZERO.  A negative entry is reported with
 * HError(8151); a row whose sum lies outside [0.99,1.01] is reported with
 * HError(-8151) (presumably a non-fatal warning - confirm against HError).
 */
static void ReadMatBigram(LModel *lm,char *fn)
{
   Vector vec;
   char buf[132];
   int P,p,j;
   float sum,x;
   LabId id;
   MatBiLM *matbi;

   if (trace&T_TIO) {
      printf("\nMB ");
      fflush(stdout);
   }

   if(InitSource(fn,&source,LangModFilter)<SUCCESS)
      HError(8110,"ReadMatBigram: Can't open file %s", fn);

   /* Row 1 is read into scratch space because the size is not yet known */
   vec = CreateVector(&gcheap,MAX_LMID);
   ReadLMWord(buf);
   SkipWhiteSpace(&source);
   id=GetLabId(buf,TRUE);
   P = ReadRow(vec);

   /* NOTE(review): this range check runs after ReadRow has already filled
      vec; whether ReadRow itself bounds the row length is not visible here */
   if (P<=0 || P >MAX_LMID)
      HError(8151,"ReadMatBigram: First row invalid (%d entries)",P);

   matbi=CreateMatBigram(lm,P);

   matbi->wdlist[1] = id;
   for (p=1;p<=P;p++) {
      matbi->bigMat[1][p]=vec[p];
   }
   id->aux=(Ptr) 1;
   Dispose(&gcheap,vec);

   /* Validate row 1 and convert it to logs */
   for (sum=0.0, j=1; j<=P; j++) {
      x = matbi->bigMat[1][j];
      if (x<0)
         HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)",
                j,buf,x);
      sum += x;
      matbi->bigMat[1][j]=((x<MINLARG)?LZERO:log(x));
   }
   if (sum < 0.99 || sum > 1.01)
      HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",1,fn,sum);

   /* Remaining rows: one word followed by P probabilities each */
   for (p=2; ReadLMWord(buf); p++) {
      if (trace&T_TIO) {
         if ((p%25)==0) {
            printf(". ");
            fflush(stdout);
         }
         if ((p%800)==0) {
            printf("\n   ");
            fflush(stdout);
         }
      }
      if (p>P)
         HError(8150,"ReadMatBigram: More rows than columns in bigram %s",fn);
      id=GetLabId(buf,TRUE);
      /* A non-zero aux means this word already labelled an earlier row */
      if ((int)id->aux != 0) 
         HError(8150,"ReadMatBigram: Duplicated name %s in bigram %s",buf,fn);
      id->aux = (Ptr) p;
      matbi->wdlist[p] = id;
      SkipWhiteSpace(&source);
      if (ReadRow(matbi->bigMat[p])!=P)
         HError(8150,"ReadMatBigram: Wrong number of items in row %d",p);
      for (sum=0.0, j=1; j<=P; j++) {
         x = matbi->bigMat[p][j];
         if (x<0)
            HError(8151,"ReadMatBigram: In bigram, entry %d for %s is -ve (%e)",
                   j,buf,x);
         sum += x;
         matbi->bigMat[p][j]=((x<MINLARG)?LZERO:log(x));
      }
      if (sum < 0.99 || sum > 1.01)
         HError(-8151,"ReadMatBigram: Row %d of bigram %s adds up to %f",p,fn,sum);
   }

   /* On loop exit p is one more than the number of rows read, so a
      complete matrix leaves p==P+1.  Anything less means rows are missing
      (the previous test P>p let a matrix that was one row short through). */
   if (p<=P)
      HError(8150,"ReadMatBigram: More columns than rows in bigram %s",fn);
   if (trace&T_TIO) {
      printf("\n");
      fflush(stdout);
   }
   CloseSource(&source);
}