/*
  Probability contribution for surface form shU under tag t at sentence
  position word_num.

  The word is lower-cased and looked up with pHst(); a positive value
  routes the computation to psktt(), anything else to psutt().  Both are
  called with the original (non-lower-cased) form.
*/
double
Pst::
pstt(ECString& shU, int t, int word_num)
{
  char lowerBuf[1024];
  ECString lowered(langAwareToLower(shU.c_str(), lowerBuf));

  // The looked-up Term is not used below; the call is retained so this
  // function performs exactly the same operations as before.
  const Term* tagTerm = Term::fromInt(t);
  (void)tagTerm;

  if(pHst(lowered, t) > 0)
    return psktt(shU, t, word_num);
  return psutt(shU, t, word_num);
}
double Pst:: psutt(const ECString& shU, int t, int word_num) { //cerr << "Unknown word: " << shU << " for tag: " << t << endl; double ans = pHugt(t); //cerr << "pHugt = " << ans << endl; if(ans == 0) return 0; double phyp = pHypgt(shU,t); ans *= phyp; //cerr << "pHypgt = " << phyp << endl; double phcp = pCapgt(shU,t, word_num); ans *= phcp; ans *= .000001; if(Term::fromInt(t)->openClass()) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(),temp)); float phegt = pegt(sh,t); if(phegt == 0) phegt = .00001; //if(phegt == 0) phegt = .00005; //cerr << "pegt( " << sh << " | " << t << " ) = " << phegt << endl; ans *= phegt; } else ans *= .00000001; //cerr << "psutt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
/*
  Returns the integer id (WordInfo::toInt) of the lower-cased head word of
  the grandparent of treeh->tree.

  When the node has no parent or no grandparent, the id of the special
  entry "^^" is returned instead; that id is looked up once and cached in
  a function-local static.

  If the grandparent's head has no Pst entry, the tree is printed to cerr
  and an assertion fires.
*/
int
tree_grandparent_head(TreeHist* treeh)
{
  InputTree* node = treeh->tree;
  InputTree* ancestor = node->parent();

  static int topWordInt = -1;
  if(topWordInt < 0)
    {
      ECString topMarker("^^");
      topWordInt = Pst::get(topMarker)->toInt();
    }

  // Climb one more level only if there was a parent to begin with.
  if(ancestor)
    ancestor = ancestor->parent();
  if(!ancestor)
    return topWordInt;

  char lowerBuf[1024];
  ECString headLower(langAwareToLower(ancestor->head().c_str(), lowerBuf));
  const WordInfo* info = Pst::get(headLower);
  if(!info)
    {
      cerr << *node << endl;
      assert(info);
    }

  const int headInt = info->toInt();
  assert(headInt >= 0);
  return headInt;
}
double Pst:: pCapgt(const ECString& shU, int t, int word_num) { if(word_num == 0) return 1; //cerr << "pCapgt = " << pcap << endl; if(shU.length() < 2) return 1; //ignore words of length 1; char temp[1024]; ECString sh(langAwareToLower(shU.c_str(),temp)); bool cap = false; if(shU[0] != sh[0] && shU[1] == sh[1]) cap = true; double pcap = pHcapgt(t); return cap ? pcap : (1 - pcap); }
double Pst:: psktt(const ECString& shU, int t, int word_num) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(), temp)); double ans = pHst(sh, t); double phcp = pCapgt(shU,t, word_num); ans *= phcp; double put = pHugt(t); ans *= (1-put); //cerr << "psktt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
/*
  Recursively walks `tree` and updates the per-tag counters
  totCounts, posCounts, posCapCounts, posDenoms, posUCounts and
  posDashCounts (all indexed by the tag's Term::toInt value).

  Every visited node bumps totCounts for its term.  A node with a
  non-empty word is treated as a leaf: its word statistics are recorded
  and the function returns without descending further.  Any other node
  recurses into each of its subtrees.

  The lower-cased word must have a Pst entry; if it does not, a message is
  written to cerr and the assertion fires.
*/
void
addWwData(InputTree* tree)
{
  ECString wTagNm = tree->term();
  const Term* trm = Term::get(wTagNm);
  int lhsInt = trm->toInt();
  totCounts[lhsInt]++;

  if( tree->word() != "" )
    {
      ECString hdLexU(tree->word());
      // Scratch buffer for langAwareToLower.  Sized at 1024 bytes (was 512)
      // to agree with most other two-argument langAwareToLower callers in
      // this code, so a long token cannot overrun a smaller buffer here.
      char temp[1024];
      ECString hdLex(langAwareToLower(hdLexU.c_str(),temp));
      int len = hdLex.length();

      const WordInfo* wi = Pst::get(hdLex);
      if(!wi)
	cerr << "Couldn't find entry for word '" << hdLex
	     << "' in pSgT.txt" << endl;
      assert(wi);

      /* Ignore words very close to the start of the sentence, those that
	 are of length 1, and those whose capitalization is ambiguous
	 (both of the first two characters change when lower-cased). */
      if(tree->start() >= 2 && len > 1
	 && !(hdLex[0] != hdLexU[0] && hdLex[1] != hdLexU[1]))
	{
	  posCounts[lhsInt]++;
	  // Capitalized: first character changed by lower-casing, second not.
	  if(hdLex[0] != hdLexU[0] && hdLex[1] == hdLexU[1])
	    {
	      posCapCounts[lhsInt]++;
	    }
	}

      posDenoms[lhsInt]++;
      // Words whose WordInfo count c() is at most 2 also feed the
      // posUCounts / posDashCounts statistics.
      if(wi->c() <= 2)
	{
	  posUCounts[lhsInt]++;
	  const char* hyppos = strpbrk(hdLex.c_str(), "-");
	  if(hyppos) posDashCounts[lhsInt]++;
	}
      return;
    }

  InputTrees& st = tree->subTrees();
  for(InputTrees::iterator subTreeIter = st.begin() ;
      subTreeIter != st.end() ; ++subTreeIter)
    {
      InputTree* subTree = *subTreeIter;
      addWwData(subTree);
    }
}
int tree_watpos(int pos) { if(pos < 0) { return nullWordInt; } ECString wrd = sentence[pos]->head(); char tmp[1024]; ECString wrdl=langAwareToLower(wrd.c_str(), tmp); const WordInfo* wi = Pst::get(wrdl); assert(wi); int ans = wi->toInt(); assert(ans >= 0); return ans; }
/*
  Integer id of the lower-cased head word of `tree`.

  If the word has no Pst entry:
    - when Feat::Usage == PARSE, returns -1;
    - otherwise reports the word on cerr and asserts.
  A found entry must have a non-negative toInt() value (asserted).
*/
int
headFromTree(InputTree* tree)
{
  char lowerBuf[1024];
  string headLower(langAwareToLower(tree->head().c_str(), lowerBuf));
  const WordInfo* info = Pst::get(headLower);

  if(!info && Feat::Usage == PARSE)
    return -1;
  if(!info)
    {
      cerr << "Could not find " << headLower << endl;
      assert(info);
    }

  const int headInt = info->toInt();
  assert(headInt >= 0);
  return headInt;
}
void incrWordData(int lhsInt, ECString wupper) { char temp[1024]; ECString w(langAwareToLower(wupper.c_str(), temp)); numTerm[lhsInt]++; WordMap::iterator wmi = wordMap.find(w); if(wmi == wordMap.end()) { wordMap[w][lhsInt] = 1; return; } PosD& posd = (*wmi).second; PosD::iterator pdi = posd.find(lhsInt); if(pdi == posd.end()) { posd[lhsInt] = 1; } else (*pdi).second++; }
/*
  Builds a flat list of (tag id, probability) pairs for `head` at sentence
  position word_num; each pair occupies two consecutive list elements.

  If useHeadC() returns an entry for the lower-cased word, every tag in
  that entry's st_ array whose id does not exceed Term::lastTagInt() is
  scored with psktt(); a zero score produces a warning on cerr but is
  still emitted.

  Otherwise every tag id in [0, Term::lastTagInt()] with non-zero pHugt()
  is scored with psutt().
*/
list<double>
Pst::
wordPlistConstruct(const ECString& head, int word_num)
{
  list<double> pairs;
  char lowerBuf[1024];
  ECString headL(langAwareToLower(head.c_str(), lowerBuf));
  const WordInfo* info = useHeadC( headL );

  if( !info )
    {
      for(int tagInt = 0 ; tagInt <= Term::lastTagInt() ; tagInt++)
	{
	  if(pHugt(tagInt) == 0)
	    continue;
	  double score = psutt(head, tagInt, word_num);
	  pairs.push_back(tagInt);
	  pairs.push_back(score);
	}
      return pairs;
    }

  const int numTags = info->stSize();
  for(int idx = 0 ; idx < numTags ; idx++)
    {
      Phsgt& entry = info->st_[idx];
      int tagInt = entry.term;
      if(tagInt <= Term::lastTagInt())
	{
	  double score = psktt(head, tagInt, word_num);
	  pairs.push_back(tagInt);
	  pairs.push_back(score);
	  if(score == 0)
	    cerr << "Warning, prob = 0 for word = " << head
		 << " and pos = " << tagInt << endl;
	}
    }
  return pairs;
}
/*
  Constructs a chart for `sentence` with identifier `id`.

  After the base class and scalar members are initialised, the
  constructor:
    - resets pretermNum and allocates a fresh EdgeHeap;
    - sets lastWord[id] to lastKnownWord;
    - asserts the sentence fits in MAXSENTLEN;
    - stores, for each word, wtoInt() of its lower-cased lexeme into the
      corresponding sentence_ entry;
    - zeroes the whole MAXSENTLEN x MAXSENTLEN curDemerits_ table.
*/
Bchart::
Bchart(SentRep & sentence, int id)
  : ChartBase( sentence,id ),
    depth(0),
    curDir(-1),
    gcurVal(NULL),
    alreadyPoppedNum( 0 )
{
  pretermNum = 0;
  heap = new EdgeHeap();

  const int numWords = sentence.length();
  lastWord[id] = lastKnownWord;
  assert(numWords <= MAXSENTLEN);

  for(int pos = 0 ; pos < numWords ; pos++)
    {
      ECString lowered = langAwareToLower(sentence[pos].lexeme());
      int wordInt = wtoInt(lowered);
      sentence_[pos].toInt() = wordInt;
    }

  for(int row = 0 ; row < MAXSENTLEN ; row++)
    {
      for(int col = 0 ; col < MAXSENTLEN ; col++)
	{
	  curDemerits_[row][col] = 0;
	}
    }
}
/*
  Recursively walks `tree` collecting word-ending statistics.

  A node with a non-empty word is treated as a leaf.  If its tag is open
  class and its lower-cased word has at least 4 characters, the word's
  Pst entry is looked up (it must exist; asserted).  When that entry's
  count c() is at most 4, incrEndData() is called with the tag id and the
  result of lastCharacter() on the word, and numTerm for the tag is
  incremented.  The function then returns without descending.

  Any other node simply recurses into each of its subtrees.
*/
void
addWwData(InputTree* tree)
{
  if( tree->word() != "" )
    {
      ECString wTagNm = tree->term();
      const Term* trm = Term::get(wTagNm);
      int lhsInt = trm->toInt();
      if(trm->openClass())
	{
	  ECString hdLexU(tree->word());
	  // Scratch buffer for langAwareToLower.  Sized at 1024 bytes (was
	  // 512) to agree with most other two-argument langAwareToLower
	  // callers in this code, so a long token cannot overrun a smaller
	  // buffer here.
	  char temp[1024];
	  ECString hdLex(langAwareToLower(hdLexU.c_str(),temp));
	  int len = hdLex.length();
	  if(len >= 4)
	    {
	      ECString e = lastCharacter(hdLex);
	      const WordInfo* wi = Pst::get(hdLex);
	      assert(wi);
	      if(wi->c() <= 4)
		{
		  incrEndData(lhsInt, e);
		  numTerm[lhsInt]++;
		}
	    }
	}
      return;
    }

  InputTrees& st = tree->subTrees();
  for(InputTrees::iterator subTreeIter = st.begin() ;
      subTreeIter != st.end() ; ++subTreeIter)
    {
      InputTree* subTree = *subTreeIter;
      addWwData(subTree);
    }
}