Ejemplo n.º 1
0
/*
 * ReadBoNGram: read and store WSJ/DP format ngram
 *
 *   lm - language model handed to CreateBoNGram to hold the result
 *   fn - name of the file to read
 *
 * The file is opened through the file-level `source` with LangModFilter.
 * After the "\data\" marker there is one header line per ngram order of
 * the form "ngram N=count" or "ngram N~count"; '~' sets the binary flag
 * that is later passed to ReadNGrams for that order.  Each order is then
 * read from its own "\N-grams:" section and the file must finish with
 * "\end\".
 *
 * Malformed headers are reported through HError.
 */
static void ReadBoNGram(LModel *lm,char *fn)
{
   NGramLM *nglm;
   int i,j,k,counts[NSIZE+1];
   Boolean ngBin[NSIZE+1];
   char buf[MAXSTRLEN+1],syc[64];
   char ngFmtCh;

   if (trace&T_TIO) {
      printf("\nBOffB ");
      fflush(stdout);
   }

   if(InitSource(fn,&source,LangModFilter)<SUCCESS)
      HError(8110,"ReadBoNGram: Can't open file %s", fn);
   GetInLine(buf);
   SyncStr(buf,"\\data\\");

   /* Start from a fully defined state so that no entry is ever read */
   /* uninitialised, whatever the header turns out to contain.       */
   for (i=0;i<=NSIZE;i++) {
      counts[i]=0;
      ngBin[i]=FALSE;
   }

   for (i=1;i<=NSIZE;i++) {
      GetInLine(buf);
      /* Reset the scan targets: sscanf leaves unmatched fields        */
      /* untouched, so without this a failed or partial match on the   */
      /* first header line would be validated using garbage values.    */
      j=0; k=0; ngFmtCh='\0';
      if (sscanf(buf, "ngram %d%c%d", &j, &ngFmtCh, &k)!=3 && i>1)
         break;                     /* No more header lines */
      /* Orders must appear in sequence with a positive count */
      if (i!=j || k<=0)
         HError(8150,"ReadBoNGram: %dGram count missing (%s)",i,buf);

      switch (ngFmtCh) {
      case '=':
         ngBin[j] = FALSE;
         break;
      case '~':
         ngBin[j] = TRUE;
         break;
      default:
         HError (9999, "ReadBoNGram: unknown ngram format type '%c'", ngFmtCh);
      }
      counts[j]=k;
   }

   if (ngBin[1])
      HError (8113, "ReadBoNGram: unigram must be stored as text");

   nglm=CreateBoNGram(lm,counts[1],counts);
   for (i=1;i<=nglm->nsize;i++) {
      sprintf(syc,"\\%d-grams:",i);
      SyncStr(buf,syc);
      ReadNGrams(nglm,i,nglm->counts[i], ngBin[i]);
   }
   SyncStr(buf,"\\end\\");
   CloseSource(&source);

   if (trace&T_TIO) {
      printf("\n NEntry==%d ",nglm->counts[0]);
      for(i=1;i<=nglm->nsize;i++)
         printf(" %d-Grams==%d",i,nglm->counts[i]);
      printf("\n\n");
      fflush(stdout);
   }
}
Ejemplo n.º 2
0
/* OutputBoBigram: output ARPA/MIL-LL style back off bigram */
void OutputBoBigram(void)
{
   LModel lm;
   NGramLM *nglm;
   NEntry *ne;
   SEntry *se;
   AEntry **aelists;
   lmId ndx[NSIZE];
   int i,tot,counts[NSIZE+1];
   double uent,ent,bent;

   lm.heap=&statHeap;
   lm.type=boNGram;
   counts[1]=lSize;counts[2]=nae;
   for(i=3;i<NSIZE+1;i++)
      counts[i]=0;
   nglm=CreateBoNGram(&lm,lSize,counts);  /* Give max size at creation */
   for (i=1;i<=lSize;i++)
      nglm->wdlist[i]=lTab[i].name;

   aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1));
   for (i=1;i<=lSize;i++) aelists[i]=NULL;
   RebuildAETab(aelists);          /* Un-hash hashtable */

   for (i=1,tot=0.0;i<=lSize;i++) {    /* Calculate unigrams first */
      if (i==(int)enterId->aux)
         nglm->unigrams[i]=0.0;
      else if (lTab[i].count<uniFloor)
         nglm->unigrams[i]=uniFloor;
      else
         nglm->unigrams[i]=lTab[i].count;
      tot+=nglm->unigrams[i];
   }
   for (i=1,uent=0.0;i<=lSize;i++,se++) {
      nglm->unigrams[i]=nglm->unigrams[i]/tot;
      uent-=ent2(nglm->unigrams[i]);
   }

   nglm->counts[1]=lSize;           /* Calculate real sizes during build */
   nglm->counts[2]=0;
   for (i=0; i<NSIZE; i++) ndx[i]=0;
   if (trace&T_BIG) {
      printf("\n  UNIGRAM NEntry        - %4d foll, ent %.3f [= %.3f]\n\n",
             lSize,uent,pow(2.0,uent));
      printf("  BIGRAMS NEntries\n");
      fflush(stdout);
   }
   for (i=1,bent=0.0;i<=lSize;i++) {
      ndx[0]=i;
      ne=GetNEntry(nglm,ndx,TRUE);
      ne->user=aelists[i];
      ent = BuildNEntry(ne,nglm->unigrams,uent);
      nglm->counts[2]+=ne->nse;
      if (trace&T_BIG) 
         if (i!=(int)exitId->aux){
            if (i==(int)enterId->aux)
               bent+=nglm->unigrams[(int)exitId->aux]*ent;
            else 
               bent+=nglm->unigrams[i]*ent;
            printf("   %-20s - %4d foll, ent %6.3f [= %6.2f]\n",
                   lTab[i].name->name,ne->nse,ent,pow(2.0,ent));
            fflush(stdout);
         }
   }
   Dispose(&tmpHeap,aelists);
   
   if (trace&T_BIG) {
      printf("\n  BIGRAM: training data entropy %.3f (perplexity %.2f)\n",
             bent,pow(2.0,bent));
      fflush(stdout);
   }

   ndx[0]=0;                        /* Set up unigram nentry separately */
   ne=GetNEntry(nglm,ndx,TRUE);
   ne->nse=lSize;
   se=ne->se=(SEntry*)New(nglm->heap,sizeof(SEntry)*lSize);
   for (i=1;i<=lSize;i++,se++) {
      se->word=i;
      if (nglm->unigrams[i]>0)
         se->prob=nglm->unigrams[i]=log(nglm->unigrams[i]);
      else
         se->prob=nglm->unigrams[i]=LZERO;
   }  

   lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */
   WriteLModel(&lm,bigFile,0);
}