示例#1
0
/* EXPORT GetLMProb: return probability of word wdid following prid[]
 *
 *   lm   - language model; lm->type selects the lookup scheme
 *   prid - history labels; NULL entries mark unused slots
 *   wdid - label of the word being predicted
 *
 * Word and history indices are taken from the labels' aux fields.
 *
 * boNGram:   returns LZERO when the word index is 0 or exceeds vocSize.
 *            Otherwise the full history is looked up; if the word is
 *            listed under it the stored probability is returned, else
 *            the history's back-off weight (0.0 when the history has no
 *            entry) is added to the probability obtained with the
 *            highest-indexed history label removed.
 * matBigram: returns bigMat[p][q] with p from prid[0] and q from wdid.
 *            NOTE(review): prid[0] is dereferenced without a NULL check.
 * otherwise: returns LZERO.
 */
float GetLMProb(LModel *lm, LabId prid[NSIZE], LabId wdid)
{
   LabId cpid[NSIZE];
   NEntry *ne;
   SEntry *se;
   lmId p, q, word, ndx[NSIZE];
   LogFloat bowt;
   int i, s;

   switch (lm->type) {
   case boNGram:
      word = (int)wdid->aux;
      if (word==0 || word>lm->data.ngram->vocSize)
         return(LZERO);

      /* Build the index key and a private copy of the history;
         s ends up as the highest slot that holds a label (-1 if none) */
      for (s=-1,i=0;i<NSIZE;i++) {
         if (prid[i]!=NULL) {
            ndx[i]=(int)prid[i]->aux;
            cpid[i]=prid[i];
            s=i;
         }
         else {
            ndx[i]=0;
            cpid[i]=NULL;
         }
      }

      /* Empty history: plain unigram (word is known to be non-zero here) */
      if (s<0)
         return(lm->data.ngram->unigrams[word]);

      /* Shortened history used if we have to recurse below */
      cpid[s]=NULL;

      ne = GetNEntry(lm->data.ngram,ndx,FALSE);
      bowt=0.0;         /* no entry for this history: no back-off penalty */
      if (ne) {
         /* Replace with bsearch equivalent */
         for (i=0, se=ne->se; i<ne->nse; i++,se++)
            if (se->word==word)
               return(se->prob); /* Ngram found */
         bowt=ne->bowt;
      }

      if (s==0)
         return(lm->data.ngram->unigrams[word]+bowt); /* Backoff to unigram */
      return(bowt+GetLMProb(lm,cpid,wdid));           /* else recurse */

   case matBigram:
      p=(int) prid[0]->aux;
      q=(int) wdid->aux;
      return(lm->data.matbi->bigMat[p][q]);

   default:
      break;
   }
   return(LZERO);
}
示例#2
0
/* EXPORT->CreateBoNGram: Allocate and create basic NGram structures
 *
 *   lm      - owning model; its heap supplies all storage and its
 *             data.ngram field is pointed at the new structure
 *   vocSize - number of words; sizes the unigram vector and word list
 *   counts  - counts[1], counts[2], ... give the number of n-grams per
 *             order; reading stops at the first zero or after NSIZE
 *
 * Returns the newly created NGramLM, with an all-zero-key NEntry
 * already created via GetNEntry.
 */
NGramLM *CreateBoNGram(LModel *lm,int vocSize, int counts[NSIZE])
{
   NGramLM *model;
   lmId rootKey[NSIZE];
   int order, slot, innerTotal;

   model = (NGramLM *) New(lm->heap, sizeof(NGramLM));
   lm->data.ngram = model;
   model->heap = lm->heap;

   /* Clear every count, then copy the leading run of non-zero counts */
   for (order=0; order<=NSIZE; order++)
      model->counts[order]=0;
   order=1;
   while (order<=NSIZE && counts[order]!=0) {
      model->counts[order]=counts[order];
      order++;
   }
   model->nsize=order-1;

   /* Sum the counts of all orders except the highest one ... */
   innerTotal=0;
   for (order=1; order<model->nsize; order++)
      innerTotal+=model->counts[order];

   /* ... and pick the NEntry hash table size from that total */
   if (innerTotal>=250000)
      model->hashsize=NGHSIZE3;
   else if (innerTotal>=25000)
      model->hashsize=NGHSIZE2;
   else
      model->hashsize=NGHSIZE1;

   model->hashtab=(NEntry **) New(lm->heap,sizeof(NEntry*)*model->hashsize);
   for (slot=0; slot<model->hashsize; slot++)
      model->hashtab[slot]=NULL;

   model->vocSize = vocSize;
   model->unigrams = CreateVector(lm->heap,model->vocSize);

   /* Word list is addressed 1..vocSize, hence the pointer shift by one */
   model->wdlist = (LabId *) New(lm->heap,model->vocSize*sizeof(LabId));
   model->wdlist--;
   for (slot=1; slot<=model->vocSize; slot++)
      model->wdlist[slot]=NULL;

   /* Create the entry whose key is all zeros */
   slot=0;
   while (slot<NSIZE)
      rootKey[slot++]=0;
   GetNEntry(model,rootKey,TRUE);

   return(model);
}   
示例#3
0
/* OutputBoBigram: output ARPA/MIT-LL style back off bigram
 *
 * Builds a temporary back-off n-gram model from file-level statistics
 * (lSize, lTab, nae, uniFloor, enterId, exitId) and writes it to bigFile:
 *   1. floor and normalise the unigram counts into probabilities,
 *   2. build one bigram NEntry per word via BuildNEntry,
 *   3. fill the all-zero-key NEntry with the log unigram probabilities,
 *   4. write the model with WriteLModel.
 * Entropy figures are printed when the T_BIG trace flag is set.
 */
void OutputBoBigram(void)
{
   LModel lm;
   NGramLM *nglm;
   NEntry *ne;
   SEntry *se;
   AEntry **aelists;
   lmId ndx[NSIZE];
   int i,counts[NSIZE+1];
   double tot;                      /* floating point: the floored counts may be fractional */
   double uent,ent,bent;

   lm.heap=&statHeap;
   lm.type=boNGram;
   counts[1]=lSize;counts[2]=nae;
   for(i=3;i<NSIZE+1;i++)
      counts[i]=0;
   nglm=CreateBoNGram(&lm,lSize,counts);  /* Give max size at creation */
   for (i=1;i<=lSize;i++)
      nglm->wdlist[i]=lTab[i].name;

   aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1));
   for (i=1;i<=lSize;i++) aelists[i]=NULL;
   RebuildAETab(aelists);          /* Un-hash hashtable */

   /* Calculate unigrams first: the enter word gets zero mass and every
      other count is raised to at least uniFloor */
   for (i=1,tot=0.0;i<=lSize;i++) {
      if (i==(int)enterId->aux)
         nglm->unigrams[i]=0.0;
      else if (lTab[i].count<uniFloor)
         nglm->unigrams[i]=uniFloor;
      else
         nglm->unigrams[i]=lTab[i].count;
      tot+=nglm->unigrams[i];
   }
   /* Normalise to probabilities and accumulate the unigram entropy.
      NOTE(review): tot is zero if every unigram above was zero */
   for (i=1,uent=0.0;i<=lSize;i++) {
      nglm->unigrams[i]=nglm->unigrams[i]/tot;
      uent-=ent2(nglm->unigrams[i]);
   }

   nglm->counts[1]=lSize;           /* Calculate real sizes during build */
   nglm->counts[2]=0;
   for (i=0; i<NSIZE; i++) ndx[i]=0;
   if (trace&T_BIG) {
      printf("\n  UNIGRAM NEntry        - %4d foll, ent %.3f [= %.3f]\n\n",
             lSize,uent,pow(2.0,uent));
      printf("  BIGRAMS NEntries\n");
      fflush(stdout);
   }
   for (i=1,bent=0.0;i<=lSize;i++) {
      ndx[0]=i;
      ne=GetNEntry(nglm,ndx,TRUE);
      ne->user=aelists[i];
      ent = BuildNEntry(ne,nglm->unigrams,uent);
      nglm->counts[2]+=ne->nse;
      if (trace&T_BIG) {
         if (i!=(int)exitId->aux){
            /* the enter word has zero unigram mass, so its entropy is
               weighted with the exit word's unigram probability */
            if (i==(int)enterId->aux)
               bent+=nglm->unigrams[(int)exitId->aux]*ent;
            else 
               bent+=nglm->unigrams[i]*ent;
            printf("   %-20s - %4d foll, ent %6.3f [= %6.2f]\n",
                   lTab[i].name->name,ne->nse,ent,pow(2.0,ent));
            fflush(stdout);
         }
      }
   }
   Dispose(&tmpHeap,aelists);
   
   if (trace&T_BIG) {
      printf("\n  BIGRAM: training data entropy %.3f (perplexity %.2f)\n",
             bent,pow(2.0,bent));
      fflush(stdout);
   }

   ndx[0]=0;                        /* Set up unigram nentry separately */
   ne=GetNEntry(nglm,ndx,TRUE);
   ne->nse=lSize;
   se=ne->se=(SEntry*)New(nglm->heap,sizeof(SEntry)*lSize);
   for (i=1;i<=lSize;i++,se++) {
      se->word=i;
      /* unigrams[] is converted to log probabilities in place */
      if (nglm->unigrams[i]>0)
         se->prob=nglm->unigrams[i]=log(nglm->unigrams[i]);
      else
         se->prob=nglm->unigrams[i]=LZERO;
   }  

   lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */
   WriteLModel(&lm,bigFile,0);
}
示例#4
0
            HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x");

         bType = wordPair;

         if (NextArg()!=STRINGARG)

            HError(3019,"HBuild: Word pair grammar file name expected");

         ipFn = GetStrArg(); 

         break;

      case 'x':

         if (bType != unknown)

            HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x");

         bType = multiLat;

         if (NextArg()!=STRINGARG)

            HError(3019,"HBuild: Multi-level lattice file name expected");

         ipFn = GetStrArg(); 

         break;

      case 'z':

         zapUnknown = TRUE; break;    

      case 'T':

         trace = GetChkedInt(0,511,s); break;

      default:

         HError(3019,"HBuild: Unknown switch %s",s);

      }

   } 

   if (NextArg()!=STRINGARG)

      HError(3019,"HBuild: Word List file name expected");

   wordListFn = GetStrArg();

   if (NextArg()!=STRINGARG)

      HError(3019,"HBuild: output lattice file name expected");

   latFn = GetStrArg();

   if (bType == unknown) bType = wordLoop;

   if (saveLatBin) format |= HLAT_LBIN;   

   /* Read the word-list into a Vocab data structure */

   InitVocab(&voc);

   if(ReadDict(wordListFn, &voc)<SUCCESS)

      HError(3013,"HBuild: ReadDict failed");

   switch (bType) {

   case matBiGram:

      if (trace & T_TOP)

         printf("Reading bigram from file %s\n",ipFn);

      bigramLm = ReadLModel(&gstack, ipFn);

      if (bigramLm->type != matBigram)

         HError(3030,"HBuild: File specified is not a matrix bigram");

      lat = ProcessBiGram(&gstack,&voc,bigramLm);

      SaveLattice(lat,latFn,format);

      break;

   case boBiGram:

      if (trace & T_TOP)
示例#5
0
/* LMTrans

     Return the log probability of the transition out of LM state src on
     word wdid, and store the resulting LM state in *dest.
     Back-off n-gram models only (asserted).

     lm    - language model, must be of type boNGram
     src   - current state (an NEntry), or NULL for the empty history
     wdid  - label of the word; its aux field holds the LM word index
     dest  - receives the NEntry for the longest history, starting with
             the new word, that has an entry; NULL if there is none

     A word index of 0 or above vocSize is reported through HError with
     code -9999; *dest is then set to NULL and LZERO is returned.
*/
LogFloat LMTrans (LModel *lm, LMState src, LabId wdid, LMState *dest)
{
   NGramLM *nglm;
   LogFloat lmprob;
   lmId hist[NSIZE] = {0};      /* initialise whole array to zero! */
   int i, l;
   NEntry *ne;
   SEntry *se;
   lmId word;

   assert (lm->type == boNGram);
   nglm = lm->data.ngram;

   word = (int) wdid->aux;

   if (word==0 || word>nglm->vocSize) {
      HError (-9999, "word %d not in LM wordlist", word);
      *dest = NULL;
      return (LZERO);
   }

   ne = src;
   
   if (!src) {          /* unigram case */
      lmprob = nglm->unigrams[word];
   }
   else {
      /* lookup prob p(word | src) */
      /* try to find pronid in SEntry array */
      se = FindSEntry (ne->se, word, 0, ne->nse - 1);

      assert (!se || (se->word == word));

      if (se)        /* found */
         lmprob = se->prob;
      else {             /* not found */
         lmprob = 0.0;
         l = 0;
         hist[NSIZE-1] = 0;
         for (i = 0; i < NSIZE-1; ++i) {
            hist[i] = ne->word[i];
            if (hist[i] != 0)
               l = i;
         } /* l is now the index of the last (oldest) non zero element */
         
         for ( ; l > 0; --l) {
            if (ne)
               lmprob += ne->bowt;
            hist[l] = 0;   /* back-off: discard oldest word */
            ne = GetNEntry (nglm, hist, FALSE);
            if (ne) {   /* skip over non existing hists. fix for weird LMs */
               /* try to find pronid in SEntry array */
               se = FindSEntry (ne->se, word, 0, ne->nse - 1);
               assert (!se || (se->word == word));
               if (se) { /* found it */
                  lmprob += se->prob;
                  l = -1;
                  break;
               }
            }
         }
         if (l == 0) {          /* backed-off all the way to unigram */
            assert (!se);
            /* the shortest history may have no entry either (the loop
               above allows for that); a missing entry adds no weight */
            if (ne)
               lmprob += ne->bowt;
            lmprob += nglm->unigrams[word];
         }
      }
   }


   /* now determine dest state: new word in slot 0, old history shifted
      up by one slot */
   if (src) {
      ne = (NEntry *) src;
      
      l = 0;
      hist[NSIZE-1] = 0;
      for (i = 1; i < NSIZE-1; ++i) {
         hist[i] = ne->word[i-1];
         if (hist[i] != 0)
            l = i;
      } /* l is now the index of the last (oldest) non zero element */
   }
   else {
      for (i = 1; i < NSIZE-1; ++i)
         hist[i] = 0;
      l = 1;
   }

   hist[0] = word;

   ne = (LMState) GetNEntry (nglm, hist, FALSE);
   for ( ; !ne && (l > 0); --l) {
      hist[l] = 0;              /* back off */
      ne = (LMState) GetNEntry (nglm, hist, FALSE);
   }
   /* if we left the loop because l=0, then ne is still NULL, which is what we want */

   *dest = ne;

#if 0
   printf ("lmprob = %f  dest %p\n", lmprob, *dest);
#endif

   return (lmprob);
}
示例#6
0
/* ReadNGrams: read n grams list from file
 *
 *   nglm  - model being filled in
 *   n     - order of the n-grams in this section (1 = unigrams)
 *   count - number of n-grams to read
 *   bin   - TRUE for the binary record layout, FALSE for text
 *
 * Input comes from `source', which is declared outside this function.
 * Each n-gram is stored as an SEntry under the NEntry of its history;
 * the SEntries of one history must be contiguous in the input and are
 * sorted with se_cmp once the history is complete.  A trailing back-off
 * weight, when present, is stored in the NEntry keyed by the n-gram.
 * Probabilities and back-off weights are multiplied by LN10 on reading.
 *
 * Returns the number of n-grams read.
 */
static int ReadNGrams(NGramLM *nglm,int n,int count, Boolean bin)
{
   float prob;
   LabId wdid;
   SEntry *cse;
   char wd[255];
   lmId ndx[NSIZE+1];
   NEntry *ne,*le=NULL;
   int i, g, idx, total;
   unsigned char flags=0;

   /* In binary mode no word text is read for n>1, but wd is still
      printed by the "Unseen word" error below: keep it a valid string */
   wd[0]='\0';

   cse = (SEntry *) New(nglm->heap,count*sizeof(SEntry));
   for (i=1;i<=NSIZE;i++) ndx[i]=0;

   if (trace&T_TIO)
      printf("\nn%1d ",n),fflush(stdout);

   total=0;
   for (g=1; g<=count; g++){
      PROGRESS(g);

      if (bin) {
         /* two header bytes per record; the first is consumed but its
            value is not used here */
         GetCh (&source);
         flags = GetCh (&source);
      }
      
      prob = GetFloat(bin)*LN10;

      if (n==1) { /* unigram treated as special */
         ReadLMWord(wd);
         wdid = GetLabId(wd, TRUE);
         if (wdid->aux != NULL)
            HError(8150,"ReadNGrams: Duplicate word (%s) in 1-gram list",
                   wdid->name);
         wdid->aux = (Ptr)g;        /* word index = position in the list */
         nglm->wdlist[g] = wdid;
         nglm->unigrams[g] = prob;
         ndx[0]=g;
      } else {    /* bigram, trigram, etc. */
         /* words are stored reversed: the last word read lands in ndx[0] */
         for (i=0;i<n;i++) {
            if (bin) {
               if (flags & BIN_ARPA_INT_LMID) {
                  unsigned int ui;
                  if (!ReadInt (&source, (int *) &ui, 1, bin))
                     HError (9999, "ReadNGrams: failed reading int lm word id");
                  idx = ui;
               }
               else {
                  unsigned short us;
                  if (!ReadShort (&source, (short *) &us, 1, bin))
                     HError (9999, "ReadNGrams: failed reading short lm word id at");
                  idx = us;
               }
            }
            else {
               ReadLMWord(wd);
               wdid = GetLabId(wd, FALSE);
               idx = (wdid==NULL?0:(int)wdid->aux);
            }
            if (idx<1 || idx>nglm->vocSize)
               HError(8150,"ReadNGrams: Unseen word (%s) in %dGram",wd,n);
            ndx[n-1-i]=idx;
         }
      }

      total++;
      /* ndx+1 is the history (everything but the predicted word) */
      ne = GetNEntry(nglm,ndx+1,FALSE);
      if (ne == NULL)
         HError(8150,"ReadNGrams: Backoff weight not seen for %dth %dGram",g,n);
      if (ne!=le) {
         /* new history: it must not already own SEntries */
         if (le != NULL && ne->se != NULL)
            HError(8150,"ReadNGrams: %dth %dGrams out of order",g,n);
         /* finish the previous history */
         if (le != NULL) {
            if (le->nse==0) {
               le->se=NULL;
            } else {
               qsort(le->se,le->nse,sizeof(SEntry),se_cmp);
            }
         }
         ne->se = cse;
         ne->nse = 0;
         le = ne;
      }
      cse->prob = prob;
      cse->word = ndx[0];
      ne->nse++; cse++;

      /* read back-off weight */
      if (bin) {
         if (flags & BIN_ARPA_HAS_BOWT) {
            ne = GetNEntry(nglm,ndx,TRUE);
            ne->bowt = GetFloat (TRUE)*LN10;
         }
      }
      else {
         SkipWhiteSpace(&source);
         if (!source.wasNewline) {
            ne=GetNEntry(nglm,ndx,TRUE);
            ne->bowt = GetFloat(FALSE)*LN10;
         }
      }
   }

   /* deal with the last accumulated set */
   if (le != NULL) {
      if (le->nse==0) {
         le->se=NULL;
      } else {
         qsort(le->se,le->nse,sizeof(SEntry),se_cmp);
      }
   }

   if (trace&T_TIO)
      printf("\n"),fflush(stdout);

   return(total);
}
示例#7
0
/* WriteNGrams: Write n grams to file
 *
 *   file  - output stream
 *   nglm  - model to dump
 *   n     - order of the section to write
 *   scale - factor applied to every probability and back-off weight
 *
 * Collects every NEntry whose history has exactly n-1 leading non-zero
 * words and at least one follower, sorts them with nep_cmp, and prints
 * one line per follower: scaled probability, the words, and - when an
 * entry with followers exists for the whole n-gram - its scaled back-off
 * weight.  Returns the number of n-grams written.
 */
static int WriteNGrams(FILE *file,NGramLM *nglm,int n,float scale)
{
   NEntry *ctx,*boEntry,**ctxTab;
   SEntry *foll;
   LogFloat lp;
   lmId key[NSIZE+1];
   char *name;
   int sep,pos,row,f,N,shown=1,bucket,nCtx,written,keep;

   if (trace&T_TIO) {
      printf("\nn%1d ",n);
      fflush(stdout);
   }
   fprintf(file,"\n\\%d-grams:\n",n);
   N=VectorSize(nglm->unigrams);

   ctxTab=(NEntry **) New(&gstack,sizeof(NEntry*)*nglm->counts[0]);

   /* Gather the histories that belong to this order */
   nCtx=0;
   for (bucket=0;bucket<nglm->hashsize;bucket++) {
      for (ctx=nglm->hashtab[bucket]; ctx!=NULL; ctx=ctx->link) {
         keep=1;
         /* the first n-1 history words must all be present ... */
         for (pos=0; keep && pos<n-1; pos++)
            if (ctx->word[pos]==0)
               keep=0;
         /* ... and every later slot must be empty */
         for (pos=n-1; keep && pos<NSIZE-1; pos++)
            if (ctx->word[pos]!=0)
               keep=0;
         if (keep && ctx->nse>0)
            ctxTab[nCtx++]=ctx;
      }
   }
   qsort(ctxTab,nCtx,sizeof(NEntry*),nep_cmp);

   written=0;
   for (pos=n;pos<=NSIZE;pos++)
      key[pos]=0;
   for (row=0;row<nCtx;row++) {
      ctx=ctxTab[row];
      for (pos=1;pos<n;pos++)
         key[pos]=ctx->word[pos-1];
      if (ctx==NULL || ctx->nse<=0)
         continue;

      foll=ctx->se;
      for (f=0; f<ctx->nse; f++,foll++) {
         if (trace&T_TIO) {
            if ((shown%25000)==0) {
               printf(". ");
               fflush(stdout);
            }
            if ((shown%800000)==0) {
               printf("\n   ");
               fflush(stdout);
            }
            shown++;
         }
         key[0]=foll->word;

         /* A back-off weight is only printed below the top order and
            only when the n-gram's own entry has followers */
         boEntry=NULL;
         if (n<nglm->nsize) {
            boEntry=GetNEntry(nglm,key,FALSE);
            if (boEntry!=NULL && boEntry->nse==0)
               boEntry=NULL;
         }
         written++;

         if (n==1)
            lp=nglm->unigrams[foll->word];
         else
            lp=foll->prob;

         if (lp*scale<-99.999)
            fprintf(file,"%+6.3f",-99.999);
         else
            fprintf(file,"%+6.4f",lp*scale);

         /* words go out from highest key slot down to the follower in
            slot 0; tab before the first, blanks between the rest */
         sep='\t';
         for (pos=n-1;pos>=0;pos--) {
            name=nglm->wdlist[key[pos]]->name;
            if (!rawMITFormat)
               name=ReWriteString(name,NULL,ESCAPE_CHAR);
            fprintf(file,"%c%s",sep,name);
            sep=' ';
         }

         if (boEntry!=NULL)
            fprintf(file,"\t%+6.4f\n",boEntry->bowt*scale);
         else
            fprintf(file,"\n");
      }
   }
   Dispose(&gstack,ctxTab);
   if (trace&T_TIO) {
      printf("\n");
      fflush(stdout);
   }
   return(written);
}