/*
  Builds the variant of `oval' in which position `ith' selects the entry
  one past the one `oval' currently selects (index oval->vec(ith) + 1)
  from the ith Bst of oval.

  stop  is set to true when oval->vec(ith) is already > 0; it is never
        cleared here.

  Returns NULL when
    - oval->wrd() >= 0 (original author marked this test with "???"),
    - the ith Bst has no entry at the requested index, or
    - that entry's prob() is negative.
  Otherwise returns a freshly allocated copy of oval (new Val(oval)) whose
  vec(ith) is advanced and whose prob() is rescaled by
      prob(new entry) / prob(currently selected entry).
  The result is heap allocated; presumably the caller (or the Bst it is
  pushed onto) takes ownership -- TODO confirm.
*/
Val*
Val::
newIth(int ith, Val* oval, bool& stop)
{
  int ithc = oval->vec(ith);
  if(ithc > 0) stop = true;
  short nxtI = ithc + 1;

  if(oval->wrd() >= 0) return NULL; //???
  //if(oval->status == TERMINALVAL) return NULL;

  Val* nval = ithBst(ith,oval->bsts()).next(nxtI);
  if(!nval) return NULL;

  // probability of the entry oval currently uses at this position
  double ovalcompprob = ithBst(ith, oval->bsts()).nth(ithc)->prob();
  double nprob = nval->prob();
  if(nprob < 0) return NULL;

  Val* ans = new Val(oval);
  ans->vec(ith) = nxtI;
  // NOTE(review): ovalcompprob == 0 is not guarded; frac would then be
  // inf or NaN.
  double frac = nprob/ovalcompprob;
  ans->prob() *= frac;
  assert(nxtI <= ithBst(ith,ans->bsts()).num());
  //assert(ans->check());
  return ans;
}
/*
  Fills in (or returns the already filled in) Bst for item `itm' given the
  head part of speech `posInt' and head word `wd'.

  ighInfo  supplies the edge set (first) and the Bst map (second) that
           recordedBPGH() is given to locate the Bst.
  h        history; it is modified in place (h->e, h->preTerm, h->hd) and
           walked with extendByEdge / extendBySubConstit / retractByEdge.
  cval, gcval  are installed in curVal / gcurVal only around the calls that
           are bracketed below, and are reset to NULL afterwards.

  If the Bst was already explored it is returned unchanged.  For a terminal
  item a single Val with probability 1 is added.  Otherwise, for every
  edge that passes sufficiently_likely(), a Val is built from the edge's
  rule probability (times a position factor) and extended with the Bst of
  each sub-constituent; Vals with no empty sub-Bst are pushed on the Bst.
  bst.sum() is set to the sum over edges of the product of the rule
  probability, the position factor and the sub-Bst sums.
*/
Bst&
MeChart::
bestParseGivenHead(int posInt, const Wrd& wd, Item* itm,
                   FullHist* h, ItmGHeadInfo& ighInfo, Val* cval, Val* gcval)
{
  EdgeSet& es = ighInfo.first;
  BstMap& atm = ighInfo.second;

  curVal = cval;
  gcurVal = gcval;
  Bst& bst = recordedBPGH(itm, atm, h);
  if(bst.explored())
    {
      if(printDebug() > 19)
        {
          int subfv[MAXNUMFS];
          getHt(h, subfv);
          CntxArray ca(subfv);
          prDp();
          cerr << "bpknown for " << posInt << ", " << wd
               << ", " << *itm << ") : " << bst.prob() << " " << ca <<endl;
        }
      curVal=gcurVal=NULL;
      return bst;
    }
  bst.explored() = true;
  curVal=gcurVal=NULL;

  const Term* trm = itm->term();
  if(trm->terminal_p())
    {
      Val* nval = new Val;
      nval->prob() = 1;
      nval->trm1() = itm->term()->toInt();
      nval->wrd1() = itm->word()->toInt();
      nval->status = TERMINALVAL;
      bst.addnth(nval);
      bst.sum() = nval->prob();
      return bst;
    }

  if(printDebug() > 10)
    {
      prDp();
      cerr << "bestParseGivenHead(" << posInt << ", " << wd
           << ", " << *itm << ")" << endl;
    }

  // NOTE(review): bestP is never updated, so the debug line below that
  // prints it always shows 0.
  double bestP = 0;
  double sumP = 0;
  EdgeSetIter ei = es.begin();
  for( ; ei != es.end() ; ei++)
    {
      Edge* e = *ei;
      if(!sufficiently_likely(e))
        {
          continue;
        }

      float edgePg = 1;
      int finish = e->loc();
      int effVal = effEnd(finish);
      if(itm->term()->isRoot()) edgePg = 1;
      // was "edgePg == 1" (a comparison with no effect); the value is the
      // same either way because edgePg starts at 1.
      else if(Feature::isLM) edgePg = 1;
      else if(effVal == 1) edgePg = endFactor;
      else if(effVal == 0) edgePg = midFactor;

      h->e = e;
      if(printDebug() > 20)
        {
          prDp();
          cerr << "consid " << *e << endl;
        }

      gcurVal = gcval;
      float prob = meRuleProb(e,h);
      gcurVal=NULL;

      double nextP = prob * edgePg;
      double nextPs = nextP;
      Item* sitm;
      //LeftRightGotIter gi(e);
      MiddleOutGotIter gi(e);
      Val* val = new Val(e, nextPs);
      val->trm1() = itm->term()->toInt();
      val->wrd1() = wd.toInt();
      int pos = 0;
      depth++;
      h = h->extendByEdge(e);
      bool zeroProb = false;
      while( gi.next(sitm,pos) )
        {
          //cerr << "Looking at " << *sitm << endl;
          // Once one sub-constituent had an empty Bst the remaining ones
          // are only stepped over so that h stays in sync.
          if(zeroProb)
            {
              h = h->extendBySubConstit();
              continue;
            }
          if(sitm->term() == Term::stopTerm)
            {
              h = h->extendBySubConstit();
              continue;
            }
          if(pos == 0)
            {
              // position 0: recurse with the same head pos and word
              h->preTerm = posInt;
              h->hd = &wd;
              ItmGHeadInfo& ighi = sitm->posAndheads()[posInt][wd];
              Bst& bst2 = bestParseGivenHead(posInt,wd,sitm,h,ighi,val,cval);
              curVal = gcurVal = NULL;
              curDir = -1;
              if(bst2.empty())
                {
                  zeroProb = true;
                }
              val->extendTrees(bst2,pos);
              nextPs *= bst2.sum();
            }
          else
            {
              Bst& bst2 = bestParse(sitm,h,val,cval,pos);
              if(bst2.empty())
                {
                  zeroProb = true;
                }
              val->extendTrees(bst2,pos);
              nextPs *= bst2.sum();
            }
          if(printDebug() > 39)
            {
              prDp();
              cerr << "FullHist from " << *h;
            }
          h = h->extendBySubConstit();
          if(printDebug() > 39)
            cerr << " -> " << *h << endl;
        }

      // NOTE(review): when zeroProb is set, val is neither pushed nor
      // deleted here; it looks leaked unless something else owns it.
      if(!zeroProb) bst.push(val);
      if(printDebug() > 20)
        {
          prDp();
          cerr << "P(" << *e << " | " << wd << " ) = " ;
          cerr << bestP;
          cerr << "\n";
        }
      depth--;
      sumP += nextPs;
      // NOTE(review): the result of retractByEdge() is discarded, unlike
      // the extend* calls above -- confirm this is intended.
      h->retractByEdge();
      if(printDebug() > 20)
        {
          prDp();
          cerr << "Val: " << *val << endl;
        }
    }

  Val* vbest = bst.pop();
  if(vbest) bst.addnth(vbest);
  bst.sum() = sumP;
  if(printDebug() > 10)
    {
      prDp();
      cerr << "Bestpgh for "<<*itm << ", " << wd << " = " << bst.prob()<< endl;
    }
  return bst;
}
/*
  Fills in (or returns the already filled in) Bst for item `itm' under
  history `h'.

  cval, gcval, cdir  are installed in curVal / gcurVal / curDir only around
      the calls bracketed below (recordedBP, meProb, meHeadProb) and reset
      to NULL / -1 afterwards.
  h   is modified in place (h->preTerm, h->hd).

  For every (part of speech, head word) pair recorded in
  itm->posAndheads() the function computes hhprob = p(pos) * p(head),
  obtains the Bst for that pair from bestParseGivenHead(), and, when that
  Bst is non-empty, pushes an EXTRAVAL Val whose probability is the best
  entry's probability times hhprob.  bst.sum() accumulates
  bst2.sum() * hhprob over all pairs.
*/
Bst&
MeChart::
bestParse(Item* itm, FullHist* h, Val* cval, Val* gcval, int cdir)
{
  curVal = cval;
  gcurVal = gcval;
  curDir = cdir;
  Bst& bst = recordedBP(itm, h);
  curVal = gcurVal = NULL;
  curDir = -1;

  if(bst.explored())
    {
      if(printDebug() > 19)
        {
          prDp();
          cerr << "already known bestParse(" << *itm << ", ...) has p = "
               << bst.prob() << endl;
        }
      return bst;
    }
  if(printDebug() > 10)
    {
      prDp();
      cerr << "bestParse(" << *itm << ", ...)" << endl;
    }
  bst.explored() = true;  //David McClosky bug

  int itermInt = itm->term()->toInt();
  PosMap& pm = itm->posAndheads();
  for(PosIter pi = pm.begin() ; pi != pm.end() ; ++pi )
    {
      int posInt = (*pi).first;
      if(printDebug() > 16)
        {
          prDp();
          cerr << "consider Pos(" << *itm << ") = " << posInt << endl;
        }
      HeadMap& hm = (*pi).second;

      /* we are using collected counts for p(u|t) */
      float hposprob = 1;
      /* if we have reached a preterminal, then termInt == posInt
         and p(posInt|termInt) == 1 */
      if( itermInt != posInt)
        {
          curVal = cval;
          gcurVal = gcval;
          curDir = cdir;
          hposprob = meProb(posInt, h, UCALC);
          if(hposprob == 0) hposprob = .00001; //??? this can happen
          curVal = gcurVal = NULL;
          curDir = -1;
          if(printDebug() > 16)
            {
              prDp();
              cerr << "p(pos) = " << hposprob << endl;
            }
        }
      h->preTerm = posInt;

      for(HeadIter hi = hm.begin() ; hi != hm.end() ; ++hi)
        {
          const Wrd& subhw = (*hi).first;
          int wrdInt = subhw.toInt();
          if(printDebug() > 16)
            {
              // the lexeme is fetched only when it is actually printed
              prDp();
              cerr << "consider head(" << *itm << ") = "
                   << subhw.lexeme() << endl;
            }

          float hprob = 0;
          if(wrdInt >= 0 && wrdInt <= lastKnownWord)
            {
              hprob = pCapgt(&subhw,posInt);
              hprob *= (1 - pHugt(posInt));
              curVal = cval;
              gcurVal = gcval;
              curDir = cdir;
              float hprob2 = meHeadProb(wrdInt, h);
              curVal = gcurVal = NULL;
              curDir = -1;
              hprob *= hprob2;
              if(hprob < 0)
                {
                  cerr << posInt << " " << pHugt(posInt) <<" "<<hprob2 << endl;
                  assert(hprob >=0);
                }
            }
          //hprob can be zero if lower case NNPS.
          // Also taken when wrdInt < 0, since hprob is still 0 then.
          if(wrdInt > lastKnownWord || hprob == 0)
            {
              hprob = psutt(&subhw,posInt);
            }
          if(printDebug() > 16)
            {
              prDp();
              cerr << "p(hd) = "<< hprob << endl;
            }

          float hhprob = (hposprob * hprob);
          if(hhprob < 0)
            {
              cerr << hposprob << " " << hprob << endl;
              assert(hhprob >= 0);
            }

          h->hd = &subhw;
          Bst& bst2
            = bestParseGivenHead(posInt,subhw,itm,h,(*hi).second,cval,gcval);
          if(bst2.empty()) continue;

          // wrap the per-head Bst in one Val scaled by p(pos)*p(head)
          Val* nval = new Val();
          Val* oldval0 = bst2.nth(0);
          nval->prob() = oldval0->prob()*hhprob;
          nval->bsts().push_back(&bst2);
          nval->status = EXTRAVAL;
          bst.push(nval);
          bst.sum() += bst2.sum()*hhprob;
        }
    }

  Val* nbest = bst.pop();
  if(nbest) bst.addnth(nbest);
  if(printDebug() > 10)
    {
      prDp();
      cerr << "Bestp for " << *itm << " = " << bst.prob() <<endl;
    }
  return bst;
}
/* the function called by each thread is "mainLoop" */ void* mainLoop(void* arg) { loopArg *loopA = (loopArg*)arg; istream* testSStream = loopA->inpt; ostream* pstatStream = loopA->outpt; int id = loopA->id; double log600 = log2(600.0); PrintStack printStack; for( ; ; ) { InputTree correct; InputTree* cuse; /* first lock to read in the material */ pthread_mutex_lock(&readlock); if( !*testSStream ) { pthread_mutex_unlock(&readlock); break; } *testSStream >> correct; if( !*testSStream ){ pthread_mutex_unlock(&readlock); break; } totWords += correct.length()+1; int locCount = sentenceCount++; list<ECString> wtList; correct.make(wtList); SentRep sr( wtList ); // used in precision calc ExtPos extPos; if(params.extPosIfstream) extPos.read(params.extPosIfstream,sr); pthread_mutex_unlock(&readlock); cuse = &correct; int len = correct.length(); if(len > params.maxSentLen) continue; //cerr << "Len = " << len << endl; /* if( !params.field().in(sentenceCount) ) { sentenceCount++; continue; } if(sentenceCount < -1) { sentenceCount++; continue; } sentenceCount++; */ vector<ECString> poslist; correct.makePosList(poslist); ScoreTree sc; sc.setEquivInts(poslist); MeChart* chart = new MeChart( sr,extPos,id ); chart->parse( ); Item* topS = chart->topS(); if(!topS) { cerr << "Parse failed" << endl; cerr << correct << endl; error(" could not parse "); delete chart; continue; } // compute the outside probabilities on the items so that we can // skip doing detailed computations on the really bad ones chart->set_Alphas(); Bst& bst = chart->findMapParse(); if( bst.empty()) error( "mapProbs did not return answer"); float bestF = -1; int i; int numVersions = 0; Link diffs(0); //cerr << "Need num diff: " << Bchart::Nth << endl; printStruct printS; printS.sentenceCount = locCount; printS.numDiff = 0; for(numVersions = 0 ; ; numVersions++) { short pos = 0; Val* val = bst.next(numVersions); if(!val) { //cerr << "Breaking" << endl; break; } InputTree* mapparse = inputTreeFromBsts(val,pos,sr); 
bool isU; int dummy = 0; diffs.is_unique(mapparse, isU, dummy); // cerr << "V " << isU << " " << numVersions << *mapparse << endl; if(isU) { printS.probs.push_back(val->prob()); printS.trees.push_back(mapparse); printS.numDiff++; } else { delete mapparse; } if(printS.numDiff >= Bchart::Nth) break; if(numVersions > 20000) break; } ParseStats* locPst = new ParseStats[Bchart::Nth]; ParseStats bestPs; for(i = 0 ; i <printS.numDiff ; i++) { InputTree *mapparse = printS.trees[i]; assert(mapparse); sc.trips.clear(); ParseStats pSt; sc.recordGold(cuse,pSt); sc.precisionRecall(mapparse,pSt); float newF = pSt.fMeasure(); cerr << printS.sentenceCount << "\t" << newF << endl; if(newF > bestF) { bestF = newF; bestPs = pSt; } if(histPoints[i]) { locPst[i] += bestPs; } } if(printS.numDiff < Bchart::Nth) { for(i = printS.numDiff ; i < Bchart::Nth ; i++) { if(histPoints[i]) locPst[i] += bestPs; } } pthread_mutex_lock(&scorelock); for(i = 0 ; i < Bchart::Nth ; i++) totPst[i]+=locPst[i]; pthread_mutex_unlock(&scorelock); int numPrinted; /* put the sentence with which we just finished at the end of the printStack*/ printStack.push_back(printS); PrintStack::iterator psi = printStack.begin(); /* now look at each item from the front of the print stack to see if it should be printed now */ pthread_mutex_lock(&writelock); for( numPrinted =0; psi != printStack.end(); numPrinted++ ) { printStruct& pstr=(*psi); if(pstr.sentenceCount != printCount) break; *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n"; printCount++; for(i = 0 ; i < pstr.numDiff ; i++) { InputTree* mapparse = pstr.trees[i]; assert(mapparse); double logP =log2(pstr.probs[i]); logP -= (sr.length()*log600); *pstatStream << logP << "\n"; if(Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n"; else { mapparse->printproper(*pstatStream); *pstatStream << "\n"; } delete mapparse; } *pstatStream << endl; psi++; } pthread_mutex_unlock(&writelock); for(i = 0 ; i < numPrinted ; i++) printStack.pop_front(); 
if(Feature::isLM) { double lgram = log2(bst.sum()); lgram -= (sr.length()*log600); double pgram = pow(2,lgram); double iptri = chart->triGram();; double ltri = (log2(iptri)-sr.length()*log600); double ptri = pow(2.0,ltri); double pcomb1 = (0.667 * pgram)+(0.333 * ptri); double lcom1 = log2(pcomb1); totGram -= lgram; totTri -= ltri; totMix -= lcom1; if(locCount%10 == 9) { cerr << locCount << "\t"; cerr << pow(2.0,totGram/(double)totWords); cerr <<"\t" << pow(2.0,totTri/(double)totWords); cerr << "\t" << pow(2.0,totMix/(double)(totWords)); cerr << endl; } } if(locCount%50 == 1) { cerr << sentenceCount << "\t"; for(int i = 0 ; i < Bchart::Nth ; i++) if(histPoints[i]) { cerr << i << " " << totPst[i].fMeasure() << "\t"; } cerr << endl; } delete chart; delete [] locPst; } return 0; }