// Reads one line from `inp` and splits it into the fields of an n-gram entry.
//
// The line is tokenized with parseWords(). Token 0 is scanned as a float into
// `prob`, tokens 1..Order are pushed into `ng` (the literal "<unk>" is replaced
// by ng.dict->OOV()), and an optional token Order+1 is scanned as a float into
// `bow`.
//
// Parameters:
//   inp   - stream to read the line from
//   Order - number of words expected on the line
//   ng    - receives the words; its size is reset to 0 first
//   prob  - receives the value scanned from the first token
//   bow   - receives the value scanned from the trailing token, or 0.0 if absent
//
// Returns 1. A line that fills the whole buffer, or a numeric field that
// cannot be scanned, prints a message to cerr and terminates with exit(1).
int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow)
{
  // NOTE(review): this array has LMTMAXLEV+3 slots while parseWords is given a
  // limit of Order+3; presumably callers guarantee Order <= LMTMAXLEV - confirm.
  const char* words[1+ LMTMAXLEV + 1 + 1];
  char line[MAX_LINE];

  inp.getline(line,MAX_LINE);
  if (strlen(line)==MAX_LINE-1) {
    cerr << "parseline: input line exceed MAXLINE ("
         << MAX_LINE << ") chars " << line << "\n";
    exit(1);
  }

  const int howmany = parseWords(line, words, Order + 3);
  assert(howmany == (Order+ 1) || howmany == (Order + 2));

  //read words
  ng.size=0;
  for (int i=1; i<=Order; i++)
    ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV());

  //read logprob/code and logbow/code
  // The scans must not live inside assert(): with NDEBUG the whole expression
  // would be removed and prob/bow would never be written. sscanf also returns
  // EOF (non-zero) on input failure, so compare against the field count.
  if (sscanf(words[0],"%f",&prob) != 1) {
    cerr << "parseline: cannot parse probability field \"" << words[0] << "\"\n";
    exit(1);
  }

  if (howmany==(Order+2)) {
    if (sscanf(words[Order+1],"%f",&bow) != 1) {
      cerr << "parseline: cannot parse backoff field \"" << words[Order+1] << "\"\n";
      exit(1);
    }
  } else {
    bow=0.0; //this is log10prob=0 for implicit backoff
  }

  return 1;
}
// Looks up a cached value for `ng` at the given level.
//
// size == 2: cache[0] is indexed directly by the code at ng.wordp(2); the entry
//            is used only when that code is below cachesize[0].
// size == 3: the entry is located through ngt->get(); on success the slot is
//            cache[1][ng.freq]. hit/miss counters are updated on this path only.
//
// The result is written to `value` and also returned; 0 means "not cached".
// For any other size the function returns 0 and leaves `value` untouched.
double normcache::get(ngram ng,int size,double& value)
{
  switch (size) {
  case 2:
    if (*ng.wordp(2) < cachesize[0])
      value = cache[0][*ng.wordp(2)];
    else
      value = 0;
    return value;

  case 3:
    if (!ngt->get(ng,size,size-1)) {
      miss++;
      value = 0;
      return value;
    }
    hit++;
    value = cache[1][ng.freq];
    return value;

  default:
    return 0;
  }
}
// Computes the probability of `ng` at level `size`.
//
// fstar, lambda and bo are filled by mdiadaptlm::bodiscount(). They are NOT
// written when the value is served from the probability cache (only when
// MDIADAPTLM_CACHE_ENABLE is defined).
//
// The process is terminated with exit(1) if fstar or lambda exceeds 1.0000001.
//
// Combination rule:
//   size <= 1                 -> fstar
//   interpolation (!backoff)  -> fstar + lambda * prob(ng, size-1)
//   backoff, fstar > 0        -> fstar
//   backoff, lambda < 1       -> lambda / bo * prob(ng, size-1)
//   backoff, otherwise        -> prob(ng, size-1)
double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo)
{
  double result;

#ifdef MDIADAPTLM_CACHE_ENABLE
  // Cache hit: return the stored value without touching the output parameters.
  if (size<=max_caching_level && probcache[size] && ng.size>=size
      && probcache[size]->get(ng.wordp(size),result))
    return result;
#endif

  // Cache miss: compute the discounted quantities for this level.
  mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo);

  const bool outOfRange = (fstar >1.0000001 || lambda >1.0000001);
  if (outOfRange) {
    cerr << "wrong probability: " << ng
         << " , size " << size
         << " , fstar " << fstar
         << " , lambda " << lambda << "\n";
    exit(1);
  }

  if (size<=1) {
    // Lowest level: identical in back-off and interpolation mode.
    result = fstar;
  } else if (!backoff) {
    // Interpolation.
    result = fstar + lambda * prob(ng,size-1);
  } else if (fstar>0) {
    result = fstar;
  } else if (lambda<1) {
    result = lambda/bo * prob(ng,size-1);
  } else {
    assert(lambda < 1.00000001);
    result = prob(ng,size-1);
  }

#ifdef MDIADAPTLM_CACHE_ENABLE
  // Remember the freshly computed value.
  if (size<=max_caching_level && probcache[size] && ng.size>=size)
    probcache[size]->add(ng.wordp(size),result);
#endif

  return result;
}
// Stores `value` in the cache for `ng` at the given level.
//
// size == 2: the slot is cache[0] indexed by the code at ng.wordp(2); expand(0)
//            is called first when that code is >= maxcache[0], and cachesize[0]
//            is incremented on every call.
// size == 3: if ngt already has the entry, its slot cache[1][ng.freq] is
//            overwritten. Otherwise a 2-word ngram built from ng.wordp(2) and
//            ng.wordp(3) is registered in ngt with the next free slot index,
//            expand(1) is called when cachesize[1] reaches maxcache[1], and the
//            value is stored in the new slot.
//
// Returns the stored value, or 0 for any other size (nothing is stored).
double normcache::put(ngram ng,int size,double value)
{
  switch (size) {
  case 2: {
    if (*ng.wordp(2)>= maxcache[0])
      expand(0);
    cache[0][*ng.wordp(2)]=value;
    cachesize[0]++;
    return value;
  }

  case 3: {
    // Known history: overwrite the slot it already owns.
    if (ngt->get(ng,size,size-1))
      return cache[1][ng.freq]=value;

    // New history: allocate the next slot and register it in the table.
    ngram history(dict,2);
    *history.wordp(1)=*ng.wordp(2);
    *history.wordp(2)=*ng.wordp(3);
    history.freq=cachesize[1];
    cachesize[1]++;

    // Grow before writing so the slot index is valid in the current storage.
    if (cachesize[1]==maxcache[1])
      expand(1);

    ngt->put(history);
    return cache[1][history.freq]=value;
  }

  default:
    return 0;
  }
}
//creates the ngramtable on demand from the sublm tables int mixture::get(ngram& ng,int n,int lev) { if (usefulltable) { return ngramtable::get(ng,n,lev); } //free current tree resetngramtable(); //get 1-word prefix from ng ngram ug(dict,1); *ug.wordp(1)=*ng.wordp(ng.size); //local ngram to upload entries ngram locng(dict,maxlevel()); //allocate subtrees from sublm for (int i=0; i<numslm; i++) { ngram subug(sublm[i]->dict,1); subug.trans(ug); if (sublm[i]->get(subug,1,1)) { ngram subng(sublm[i]->dict,maxlevel()); *subng.wordp(maxlevel())=*subug.wordp(1); sublm[i]->scan(subug.link,subug.info,1,subng,INIT,maxlevel()); while(sublm[i]->scan(subug.link,subug.info,1,subng,CONT,maxlevel())) { locng.trans(subng); put(locng); } } } return ngramtable::get(ng,n,lev); }
// Unigram estimate for the word at ng.wordp(1), smoothed with epsilon:
//
//   (freq(word) + epsilon) / (totfreq + dictionary size * epsilon)
//
// All counts are read from `dict`.
double interplm::unigrWB(ngram ng)
{
  const double numerator =
    (double)(dict->freq(*ng.wordp(1))+epsilon);

  const double denominator =
    (double)dict->totfreq() + (double) dict->size() * epsilon;

  return numerator / denominator;
}
// Recursively combines the discounted estimates of `ng` from level `size`
// down to level 1.
//
// For size > 1 the result is fstar + lambda * txclprob(ng, size-1), with fstar
// and lambda obtained from mdiadaptlm::discount().
//
// At level 1 the result is count / N, where count starts at 1 and gains
// ng.freq when the word is not the OOV code and get(ng,1,1) finds it, and
// N = totfreq() + dict->dub() - dict->size().
double mdiadaptlm::txclprob(ngram ng,int size)
{
  if (size<=1) {
    double count=1;

    const bool found = (*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1);
    if (found)
      count+=ng.freq;

    const double total=totfreq()+dict->dub()-dict->size();
    return count/total;
  }

  double fstar,lambda;
  mdiadaptlm::discount(ng,size,fstar,lambda);
  return fstar + lambda * txclprob(ng,size-1);
}