double Pst:: psutt(const ECString& shU, int t, int word_num) { //cerr << "Unknown word: " << shU << " for tag: " << t << endl; double ans = pHugt(t); //cerr << "pHugt = " << ans << endl; if(ans == 0) return 0; double phyp = pHypgt(shU,t); ans *= phyp; //cerr << "pHypgt = " << phyp << endl; double phcp = pCapgt(shU,t, word_num); ans *= phcp; ans *= .000001; if(Term::fromInt(t)->openClass()) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(),temp)); float phegt = pegt(sh,t); if(phegt == 0) phegt = .00001; //if(phegt == 0) phegt = .00005; //cerr << "pegt( " << sh << " | " << t << " ) = " << phegt << endl; ans *= phegt; } else ans *= .00000001; //cerr << "psutt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
void Bchart:: makepUgT(ECString path) { int i; int nm = Term::lastTagInt()+1; ECString pth(path); for(int f = 0 ; f < 20 ; f++) { ECString pUString(pth); pUString += "pUgT.txt"; ifstream pUstream(pUString.c_str()); if(!pUstream) { cerr << "Could not find " << pUString << endl; assert(pUstream); } for( i = 0 ; i < nm ; i++ ) { int t; pUstream >> t; float p; if(f == 0) { pUstream >> p; pHugt(t) = p; //cerr << "set pHugt " << t << " = " << p << endl; pUstream >> p; pHcapgt(t) = p; pUstream >> p; pHhypgt(t) = p; } else { pUstream >> p; pHugt(t) += p; //cerr << "set pHugt " << t << " = " << p << endl; pUstream >> p; pHcapgt(t) += p; pUstream >> p; pHhypgt(t) += p; } }
double Pst:: psktt(const ECString& shU, int t, int word_num) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(), temp)); double ans = pHst(sh, t); double phcp = pCapgt(shU,t, word_num); ans *= phcp; double put = pHugt(t); ans *= (1-put); //cerr << "psktt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
// Loads the per-tag tables and the pHegt_ table from two text files.
//
//   pTString - path of the file holding the pHegt_ entries. After any leading
//              comment (skipped by ignoreComment) it holds an entry count
//              followed by records of the form: <int t> <char> <char> <float p>.
//   pUstring - path of the file holding, for each of Term::lastNTInt()+1
//              records: <int t> <float> <float> <float>, stored into
//              pHugt(t), pHcapgt(t) and pHhypgt(t) respectively.
//
// Side effects: overwrites pHugt/pHcapgt/pHhypgt entries, allocates pHegt_
// with new[] and sets egtSize_ to the declared entry count.
// NOTE(review): any previous pHegt_ allocation is not released here; whether
// this function can be called more than once is not visible from this block.
//
// Differences from the previous version:
//  * a missing file is reported on cerr before the assert fires;
//  * the bound on writes into pHegt_ is enforced even when asserts are
//    compiled out (NDEBUG), instead of overrunning the array;
//  * a record that ends early is dropped rather than stored half-read;
//  * the entry count is initialised, and the unused local `j` is gone.
void Pst::readTermProbs(ECString& pTString, ECString& pUstring)
{
  ifstream pTstream(pTString.c_str());
  if(!pTstream) cerr << "Could not find " << pTString << endl;
  assert(pTstream);
  ignoreComment(pTstream);

  ifstream pUstream(pUstring.c_str());
  if(!pUstream) cerr << "Could not find " << pUstring << endl;
  assert(pUstream);
  ignoreComment(pUstream);

  int i;
  for( i = 0 ; i <= Term::lastNTInt() ; i++ )
    {
      int t;
      pUstream >> t;
      float p;
      pUstream >> p;
      pHugt(t) = p;
      pUstream >> p;
      if(p == 0) p = .00001;  //Anything might be capitalized;
      pHcapgt(t) = p;
      pUstream >> p;
      pHhypgt(t) = p;
    }

  // Initialised so a failed extraction cannot leave an indeterminate size.
  int numpT = 0;
  pTstream >> numpT;
  pHegt_ = new Phegt[numpT];
  egtSize_ = numpT;

  i = 0;
  while(pTstream)
    {
      int t;
      char e0;
      char e1;
      float p;
      pTstream >> t;
      if(!pTstream) break;   // normal end of input

      // Debug builds still abort here; release builds must not write past
      // the end of pHegt_, so stop reading instead.
      assert(i < numpT);
      if(i >= numpT)
	{
	  cerr << "Warning, " << pTString << " has more than " << numpT
	       << " entries; ignoring the rest" << endl;
	  break;
	}

      pTstream >> e0;
      pTstream >> e1;
      pTstream >> p;
      if(!pTstream)
	{
	  // failbit means a field was missing or malformed; reaching EOF
	  // right after a complete final record does not set failbit.
	  cerr << "Warning, incomplete entry at end of " << pTString << endl;
	  break;
	}

      pHegt_[i].t = t;
      pHegt_[i].e[0] = e0;
      pHegt_[i].e[1] = e1;
      pHegt_[i].p = p;
      i++;
    }
}
// Builds the flat (tag, probability) list for one word.
//
//   head     - the word as it appeared (case preserved).
//   word_num - forwarded unchanged to psktt / psutt.
//
// The returned list alternates tag id (stored as a double) and probability:
// tag0, prob0, tag1, prob1, ...
//  * If useHeadC() finds the lower-cased word, one pair is emitted for every
//    recorded entry whose term id is <= Term::lastTagInt(), using psktt().
//    A zero probability is still emitted, and a warning goes to cerr.
//  * Otherwise one pair is emitted for every tag id in
//    [0, Term::lastTagInt()] whose pHugt() is non-zero, using psutt().
list<double> Pst::wordPlistConstruct(const ECString& head, int word_num)
{
  list<double> pairs;

  // NOTE(review): fixed 1024-byte scratch buffer for langAwareToLower;
  // whether long words are bounds-checked is not visible here.
  char lowered[1024];
  ECString headL(langAwareToLower(head.c_str(), lowered));
  const WordInfo* info = useHeadC( headL );

  if( !info )
    {
      // Word not found: consider every tag with a non-zero pHugt entry.
      for(int tag = 0 ; tag <= Term::lastTagInt() ; ++tag)
	{
	  double unkGivenTag = pHugt(tag);
	  if(unkGivenTag != 0)
	    {
	      double prob = psutt(head, tag, word_num);
	      pairs.push_back(tag);
	      pairs.push_back(prob);
	    }
	}
      return pairs;
    }

  // Word found: walk the entries recorded for it.
  const int numEntries = info->stSize();
  for( int k = 0 ; k < numEntries ; ++k )
    {
      Phsgt& entry = info->st_[k];
      const int tag = entry.term;
      if(tag <= Term::lastTagInt())
	{
	  double prob = psktt(head, tag, word_num);
	  pairs.push_back(tag);
	  pairs.push_back(prob);
	  if(prob == 0)
	    cerr << "Warning, prob = 0 for word = " << head
		 << " and pos = " << tag << endl;
	}
    }
  return pairs;
}
// Returns the Bst record for (itm, h), filling it in on first visit.
//
//   itm   - chart item whose parses are being scored.
//   h     - history object; its preTerm and hd fields are overwritten while
//           the item's (pos, head) alternatives are enumerated.
//   cval, gcval, cdir
//         - context copied into the members curVal / gcurVal / curDir around
//           each call to recordedBP, meProb and meHeadProb, and reset to
//           NULL / NULL / -1 straight afterwards. cval and gcval are also
//           passed on to bestParseGivenHead.
//
// If the record is already marked explored it is returned untouched.
// Otherwise, for every (pos, head) pair in itm->posAndheads():
//   weight = p(pos) * p(head), and when bestParseGivenHead() yields a
//   non-empty Bst, a new EXTRAVAL Val with prob = best-sub-prob * weight is
//   pushed onto the record and the record's sum() grows by sub-sum * weight.
// Finally one Val is popped and, if non-null, registered with addnth().
Bst& MeChart::bestParse(Item* itm, FullHist* h, Val* cval, Val* gcval, int cdir)
{
  curVal = cval;
  gcurVal = gcval;
  curDir = cdir;
  Bst& result = recordedBP(itm, h);
  curVal = gcurVal = NULL;
  curDir = -1;

  if(result.explored())
    {
      if(printDebug() > 19)
	{
	  prDp();
	  cerr << "already known bestParse(" << *itm << ", ...) has p = "
	       << result.prob() << endl;
	}
      return result;
    }

  if(printDebug() > 10)
    {
      prDp();
      cerr << "bestParse(" << *itm << ", ...)" << endl;
    }

  // Flag the record before descending (original note: "David McClosky bug").
  result.explored() = true;

  const int itemTermInt = itm->term()->toInt();
  PosMap& posMap = itm->posAndheads();

  for(PosIter posIt = posMap.begin() ; posIt != posMap.end() ; ++posIt)
    {
      int posInt = posIt->first;
      if(printDebug() > 16)
	{
	  prDp();
	  cerr << "consider Pos(" << *itm << ") = " << posInt << endl;
	}
      HeadMap& heads = posIt->second;

      /* we are using collected counts for p(u|t) */
      /* at a preterminal the item's term equals posInt, so
	 p(posInt|term) stays 1 and the model is not consulted */
      float posProb = 1;
      if(itemTermInt != posInt)
	{
	  curVal = cval;
	  gcurVal = gcval;
	  curDir = cdir;
	  posProb = meProb(posInt, h, UCALC);
	  if(posProb == 0) posProb = .00001; //??? this can happen;
	  curVal = gcurVal = NULL;
	  curDir = -1;
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr << "p(pos) = " << posProb << endl;
	    }
	}
      h->preTerm = posInt;

      for(HeadIter headIt = heads.begin() ; headIt != heads.end() ; ++headIt)
	{
	  const Wrd& subhw = headIt->first;
	  int wrdInt = subhw.toInt();
	  ECString subh = subhw.lexeme();
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr << "consider head(" << *itm << ") = " << subh << endl;
	    }

	  float headProb = 0;
	  if(wrdInt >= 0 && wrdInt <= lastKnownWord)
	    {
	      // Known-word path: pCapgt * (1 - pHugt) * meHeadProb.
	      headProb = pCapgt(&subhw,posInt);
	      headProb *= (1 - pHugt(posInt));
	      curVal = cval;
	      gcurVal = gcval;
	      curDir = cdir;
	      float modelHeadProb = meHeadProb(wrdInt, h);
	      curVal = gcurVal = NULL;
	      curDir = -1;
	      headProb *= modelHeadProb;
	      if(headProb < 0)
		{
		  cerr << posInt << " " << pHugt(posInt) <<" "<<modelHeadProb << endl;
		  assert(headProb >=0);
		}
	    }
	  // headProb can be zero if lower case NNPS; in that case, and for
	  // word ids beyond lastKnownWord, use the unknown-word estimate.
	  if(wrdInt > lastKnownWord || headProb == 0)
	    {
	      headProb = psutt(&subhw,posInt);
	    }
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr << "p(hd) = "<< headProb << endl;
	    }

	  float weight = (posProb * headProb);
	  if(weight < 0)
	    {
	      cerr << posProb << " " << headProb << endl;
	      assert(weight >= 0);
	    }

	  h->hd = &subhw;
	  Bst& sub = bestParseGivenHead(posInt,subhw,itm,h,headIt->second,cval,gcval);
	  if(!sub.empty())
	    {
	      Val* wrapped = new Val();
	      Val* subBest = sub.nth(0);
	      wrapped->prob() = subBest->prob()*weight;
	      wrapped->bsts().push_back(&sub);
	      wrapped->status = EXTRAVAL;
	      result.push(wrapped);
	      result.sum() += sub.sum()*weight;
	    }
	}
    }

  Val* top = result.pop();
  if(top) result.addnth(top);

  if(printDebug() > 10)
    {
      prDp();
      cerr << "Bestp for " << *itm << " = " << result.prob() <<endl;
    }
  return result;
}