/*
 * Returns the integer used for the label of `tree`, adjusted when the node
 * looks like a conjunction of constituents that share its label.
 *
 * Only NP, S and VP nodes are examined; any other label yields its plain
 * Term integer.  The children are scanned once, counting those whose label
 * equals the parent's and noting a conjunction, comma or colon that is not
 * the first child.
 *
 *   "NP" with exactly two same-labelled children and no conjunction seen
 *        -> Term::lastNTInt() + 1
 *   two or more same-labelled children and a conjunction/comma/colon seen
 *        -> label integer + Term::lastNTInt()
 *   otherwise
 *        -> label integer
 */
int
ccIndFromTree(InputTree* tree)
{
  ECString parentLabel = tree->term();
  const Term* parentTerm = Term::get(parentLabel);
  int parentInt = parentTerm->toInt();

  /* Change next line to indicate which non-terminals get specially
     marked to indicate that they are conjoined together */
  if(!(parentTerm->isNP() || parentTerm->isS() || parentTerm->isVP()))
    return parentInt;

  bool conjSeen = false;
  bool commaSeen = false;
  bool colonSeen = false;
  bool otherNonTermSeen = false;   /* recorded but not consulted below */
  int sameLabelCount = 0;
  int childIndex = 0;

  InputTreesIter child = tree->subTrees().begin();
  while(child != tree->subTrees().end())
    {
      ECString childLabel = (*child)->term();
      const Term* childTerm = Term::get(childLabel);
      bool notFirst = (childIndex != 0);

      if(notFirst && childTerm->isCC()) conjSeen = true;
      else if(childLabel == parentLabel) sameLabelCount++;
      else if(notFirst && childTerm->isComma()) commaSeen = true;
      else if(notFirst && childTerm->isColon()) colonSeen = true;
      else if(!childTerm->terminal_p()) otherNonTermSeen = true;

      childIndex++;
      child++;
    }

  if(parentLabel == "NP" && sameLabelCount == 2 && !conjSeen)
    return Term::lastNTInt()+1;

  bool separatorSeen = commaSeen || colonSeen || conjSeen;
  if(separatorSeen && sameLabelCount >= 2)
    return parentInt+Term::lastNTInt();

  return parentInt;
}
int main(int argc, char *argv[]) { ECArgs args(argc, argv); params.init(args); ECString path(args.arg(0)); generalInit(path); // we don't use sentenceCount since it treeLogProb may parse the // sentence multiple times int index = 0; while (true) { if (!cin) { break; } InputTree correct; cin >> correct; int len = correct.length(); if (len == 0) { break; } if (len > params.maxSentLen) { continue; } double logProb; try { logProb = treeLogProb(&correct); } catch (ParserError) { logProb = 0; } cout << index << "\t" << logProb << endl; index++; } return 0; }
/*
 * Decides, from the tree structure alone, whether `child` (a direct child of
 * `tree`) is followed by something treated as an end.  Returns 1 or 0.
 *
 *   - a null `tree`, or a `tree` whose term isRoot(), gives 1;
 *   - if `child` is the last child, or the next sibling is Term::stopTerm,
 *     the question is passed up to tree->parent() with `tree` as the child;
 *   - a next sibling that isColon() or isFinal() gives 1;
 *   - a next sibling that isComma() gives 0;
 *   - otherwise the sibling after that is inspected: a "''" label gives 1,
 *     anything else (or no such sibling) gives 0.
 *
 * `child` must be among tree->subTrees(); this is only checked by assert.
 */
int
is_effEnd(InputTree* tree, InputTree* child)
{
  if(!tree) return 1;
  if(Term::get(tree->term())->isRoot()) return 1;

  /* locate child among the sub-trees */
  InputTreesIter kid = tree->subTrees().begin();
  for( ; ; )
    {
      assert(kid != tree->subTrees().end());
      InputTree* candidate = *kid;
      assert(candidate);
      if(candidate == child) break;
      kid++;
    }

  /* step to the sibling that follows child */
  kid++;
  if(kid == tree->subTrees().end())
    return is_effEnd(tree->parent(),tree);

  InputTree* following = *kid;
  ECString followingLabel = following->term();
  const Term* followingTerm = Term::get(followingLabel);
  if(followingTerm == Term::stopTerm)
    return is_effEnd(tree->parent(),tree);
  if(followingTerm->isColon() || followingTerm->isFinal()) return 1;
  if(followingTerm->isComma()) return 0;

  /* look one sibling further for a closing quote */
  kid++;
  if(kid == tree->subTrees().end()) return 0;
  if((*kid)->term() == "''") return 1;
  return 0;
}
/*
 * Returns the Pst integer of the (lower-cased) head word of the grandparent
 * of treeh->tree.  When there is no parent or no grandparent, the integer of
 * the word "^^" is returned instead; that value is looked up once and cached
 * in a function-local static.
 *
 * If the head word is unknown to Pst the tree is written to cerr and an
 * assert fires.
 */
int
tree_grandparent_head(TreeHist* treeh)
{
  InputTree* tree = treeh->tree;
  InputTree* parent = tree->parent();

  static int topInt = -1;
  if(topInt < 0)
    {
      ECString topWord("^^");
      topInt = Pst::get(topWord)->toInt();
    }

  if(!parent) return topInt;
  InputTree* grandparent = parent->parent();
  if(!grandparent) return topInt;

  char lowered[1024];   /* scratch buffer handed to langAwareToLower */
  ECString headWord(langAwareToLower(grandparent->head().c_str(),lowered));
  const WordInfo* info = Pst::get(headWord);
  if(!info)
    {
      cerr << *tree << endl;
      assert(info);
    }
  int wordInt = info->toInt();
  assert(wordInt >= 0);
  return wordInt;
}
/*
 * Returns the zero-based index of the child of `tree` chosen as its head,
 * or -1 when no child qualifies (including when there are no children).
 *
 * Each child label is scored with headPriority(parent, child, bestSoFar).
 * A child replaces the current choice when its priority is less than or
 * equal to the best so far, so among equal priorities the right-most child
 * wins.  The starting threshold is 10.  An empty parent label is treated
 * as "S1".
 */
int
headPosFromTree(InputTree* tree)
{
  ECString parentLabel(tree->term());
  if(parentLabel == "") parentLabel = "S1";

  int bestPriority = 10;
  int bestIndex = -1;
  int index = 0;
  ConstInputTreesIter child = tree->subTrees().begin();
  for( ; child != tree->subTrees().end() ; child++, index++)
    {
      InputTree* subTree = *child;
      ECString childLabel(subTree->term());
      int priority = headPriority(parentLabel, childLabel, bestPriority);
      if(priority > bestPriority) continue;
      bestIndex = index;
      bestPriority = priority;
    }
  return bestIndex;
}
/*
 * Runs one pass over the first `sc` trees of trainingData.
 *
 * For each tree: makeSent() and gatherFfCounts(tree, 0) are called.  When
 * the global whichInt equals TTCALC, a synthetic "STOP" tree positioned at
 * the sentence's finish() is additionally built and handed to callProcG().
 *
 * NOTE(review): the word argument below is a conditional on the very test
 * that guards the block, so it always evaluates to "" here — confirm whether
 * other calculation kinds were meant to enter this block too.
 */
void
goThroughSents(InputTree* trainingData[1301], int sc)
{
  for(int sentNo = 0 ; sentNo < sc ; sentNo++)
    {
      InputTree* root = trainingData[sentNo];
      makeSent(root);
      gatherFfCounts(root,0);

      if(whichInt != TTCALC) continue;

      list<InputTree*> noChildren;
      InputTree stopTree(root->finish(),root->finish(),
                         whichInt==TTCALC ? "" : "^^",
                         "STOP","",
                         noChildren,NULL,NULL);
      stopTree.headTree() = &stopTree;   /* the stop node is its own head */
      TreeHist stopHist(&stopTree,0);
      stopHist.hpos = 0;
      callProcG(&stopHist);
    }
}
/*
 * Returns the Term integer of the head tag (hTag) of the grandparent of
 * treeh->tree.  When there is no parent or no grandparent the integer of
 * "STOP" is returned; it is looked up on first use and cached.
 *
 * A head tag that is not a terminal is reported on cerr and trips an assert.
 */
int
tree_grandparent_pos(TreeHist* treeh)
{
  static int stopInt = 0;
  if(stopInt == 0)
    {
      ECString stopName("STOP");
      stopInt = Term::get(stopName)->toInt();
    }

  InputTree* tree = treeh->tree;
  InputTree* parent = tree->parent();
  if(!parent) return stopInt;
  InputTree* grandparent = parent->parent();
  if(!grandparent) return stopInt;

  const ECString& tagName = grandparent->hTag();
  const Term* tag = Term::get(tagName);
  assert(tag);
  if(!tag->terminal_p())
    {
      cerr << "Bad head Part of Speech: " << *tag << " in " <<endl;
      cerr << *tree << endl;
      assert(tag->terminal_p());
    }
  return tag->toInt();
}
/*
 * Scans the children of treeh->tree from right to left, starting at the last
 * child and stopping when index treeh->pos is reached (that child is not
 * examined).  Children to the right of treeh->hpos are skipped.
 *
 * A child whose term isClosed() sets a pending flag; a later (further left)
 * child whose term isOpen() clears it.  Returns 0 if the flag is still set
 * when the scan stops, 1 otherwise.
 */
int
tree_noopenQl(TreeHist* treeh)
{
  InputTree* tree = treeh->tree;
  const int stopAt = treeh->pos;
  const int headPos = treeh->hpos;

  bool pending = false;
  int idx = tree->subTrees().size()-1;
  InputTrees::reverse_iterator rit = tree->subTrees().rbegin();
  while(idx != stopAt)
    {
      if(idx <= headPos)
        {
          assert(idx >= 0);
          InputTree* subTree = *rit;
          const Term* trm = Term::get(subTree->term());
          if(trm->isClosed() && !pending) pending = true;
          else if(trm->isOpen() && pending) pending = false;
        }
      idx--;
      rit++;
    }
  return pending ? 0 : 1;
}
/*
 * Walks `tree` recursively and counts unary rewrites parent -> child.
 *
 * For a node with exactly one child whose label is a non-terminal different
 * from the parent's, treeData_[parent][child] is incremented, both indices
 * being the Term integers offset by Term::lastTagInt() + 1.  A single
 * terminal child is skipped and not recursed into; every other child is
 * visited recursively.
 */
void
UnitRules::
gatherData(InputTree* tree)
{
  const Term* trm = Term::get(tree->term());
  assert(trm);
  int parInt = trm->toInt();
  int rparI = parInt-( Term::lastTagInt() + 1);
  int len = tree->subTrees().size();

  InputTreesIter iti = tree->subTrees().begin();
  for( ; iti != tree->subTrees().end() ; iti++)
    {
      InputTree* stree = (*iti);
      if(len == 1)
        {
          const Term* strm = Term::get(stree->term());
          /* verify the lookup before the pointer is used */
          assert(strm);
          if(strm->terminal_p()) continue;
          int chiInt = strm->toInt();
          if(chiInt == parInt) continue;
          int rchiI = chiInt -( Term::lastTagInt() + 1);
          treeData_[rparI][rchiI]++;
        }
      gatherData(stree);
    }
}
/*
 * Returns 1 when the word position just after treeh->tree (its finish())
 * counts as an end, 0 otherwise, judged from the global `sentence`/`endPos`.
 *
 *   finish() >  endPos : prints "Pos > endPos" on cout and returns 0
 *   finish() == endPos : 1
 *   next tag is "." or next word is ";" : 1
 *   fewer than two positions remain : 0
 *   next word is "," and the one after is "''" : 1
 *   anything else : 0
 */
int
tree_effEnd(TreeHist* treeh)
{
  int pos = treeh->tree->finish();

  if(pos > endPos)
    {
      cout << "Pos > endPos" << endl;
      return 0;
    }
  if(pos == endPos) return 1;

  ECString wrd = sentence[pos]->word();
  ECString trm = sentence[pos]->term();
  if(trm == "." || wrd == ";") return 1;
  if((pos+2) > endPos) return 0;

  // ,'' acts like end of sentence;
  if(wrd == "," && sentence[pos+1]->word() == "''") return 1;
  return 0;
}
/* Returns the Term integer of the label of treeh->tree. */
int
tree_term(TreeHist* treeh)
{
  const ECString& label = treeh->tree->term();
  const Term* labelTerm = Term::get(label);
  assert(labelTerm);
  return labelTerm->toInt();
}
/*
 * Returns the parent of treeh->tree, but only when that parent has a
 * different headTree() from the tree itself.  Returns NULL when there is no
 * parent or when both share the same headTree().
 */
InputTree*
tree_parent_tree(TreeHist* treeh)
{
  InputTree* node = treeh->tree;
  InputTree* parent = node->parent();
  if(parent && parent->headTree() != node->headTree())
    return parent;
  return NULL;
}
/*
 * Buckets the span of treeh->tree (finish() - start()) and returns the
 * bucket index 0..8.  Bucket i is the first one whose upper bound
 * {1, 3, 6, 10, 15, 21, 28, 36, 999} is >= the span.
 *
 * A span above 999 is treated as a programming error: it trips an assert in
 * debug builds and returns -1 otherwise.
 */
int
tree_size(TreeHist* treeh)
{
  static const int bucs[9] = {1, 3, 6, 10, 15, 21, 28, 36, 999};
  InputTree* tree = treeh->tree;
  int sz = tree->finish() - tree->start();
  for(int i = 0 ; i < 9 ; i++)
    if(sz <= bucs[i]) return i;
  /* a bare string literal is always true, so it must be negated to fire */
  assert(!"Never get here");
  return -1;
}
/*
 * Returns the Term integer of the label of the sibling immediately to the
 * right of treeh->tree.  The integer of "STOP" (looked up once and cached)
 * is returned when the tree has no parent or is its parent's last child.
 *
 * If the tree cannot be found among its parent's children, error() is
 * called and -1 is returned.
 */
int
tree_term_after(TreeHist* treeh)
{
  static int stopInt = 0;
  if(stopInt == 0)
    {
      ECString stopName("STOP");
      stopInt = Term::get(stopName)->toInt();
    }

  InputTree* node = treeh->tree;
  InputTree* parent = node->parent();
  if(!parent) return stopInt;

  InputTreesIter sib = parent->subTrees().begin();
  while(sib != parent->subTrees().end() && *sib != node)
    sib++;

  if(sib == parent->subTrees().end())
    {
      error("Should never get here");
      return -1;
    }

  sib++;
  if(sib == parent->subTrees().end()) return stopInt;

  InputTree* next = *sib;
  const ECString& nextLabel = next->term();
  const Term* nextTerm = Term::get(nextLabel);
  assert(nextTerm);
  return nextTerm->toInt();
}
/*
 * Returns the Term integer of the head tag (hTag) of treeh->tree.
 * A head tag that is not a terminal is reported on cerr, together with the
 * tree, and trips an assert.
 */
int
tree_pos(TreeHist* treeh)
{
  InputTree* node = treeh->tree;
  const ECString& tagName = node->hTag();
  const Term* tag = Term::get(tagName);
  assert(tag);

  bool tagIsTerminal = tag->terminal_p();
  if(!tagIsTerminal)
    {
      cerr << "Bad head Part of Speech: " << *tag << " in " <<endl;
      cerr << *node << endl;
      assert(tag->terminal_p());
    }
  return tag->toInt();
}
/*
 * Returns the Term integer of the label of the parent of treeh->tree, or
 * the integer of "S1" (looked up once and cached) when there is no parent.
 * The parent label is asserted to be a non-terminal.
 */
int
tree_parent_term(TreeHist* treeh)
{
  InputTree* node = treeh->tree;

  static int s1Int = 0;
  if(s1Int == 0)
    {
      ECString s1Name("S1");
      s1Int = Term::get(s1Name)->toInt();
    }

  InputTree* parent = node->parent();
  if(!parent) return s1Int;

  const ECString& parentLabel = parent->term();
  const Term* parentTerm = Term::get(parentLabel);
  assert(parentTerm);
  assert(!parentTerm->terminal_p());
  return parentTerm->toInt();
}
/*
 * Returns the Term integer of the label of the child of treeh->tree at
 * index m = treeh->pos + n*l.
 *
 * The integer of "STOP" (looked up once and cached) is returned instead
 * when m is negative, when m is past the last child, or when l > 0 and m
 * lies to the right of treeh->hpos.
 */
int
tree_ngram(TreeHist* treeh, int n, int l)
{
  static int stopTermInt = -1;
  if(stopTermInt < 0)
    {
      ECString stopStr("STOP");
      const Term* stopTerm = Term::get(stopStr);
      stopTermInt = stopTerm->toInt();
    }

  int pos = treeh->pos;
  int hp = treeh->hpos;
  int m = pos + (n * l);
  if(m < 0) return stopTermInt;

  InputTree* tree = treeh->tree;
  /* m is known to be non-negative here; compare as ints explicitly */
  if(m >= static_cast<int>(tree->subTrees().size())) return stopTermInt;
  if(m > hp && l > 0) return stopTermInt;

  InputTreesIter subTreeIter = tree->subTrees().begin();
  int i = 0;
  for( ; subTreeIter != tree->subTrees().end() ; subTreeIter++ )
    {
      if(i == m)
        {
          InputTree* subTree = *subTreeIter;
          const Term* trm = Term::get(subTree->term());
          return trm->toInt();
        }
      i++;
    }
  /* the range checks above guarantee the loop returns; a bare string
     literal is always true, so it must be negated for the assert to fire */
  assert(!"should never get here");
  return -1;
}
/*
 * Like a parent-label lookup, but conjunction-aware.
 *
 * Returns the integer of "S1" (cached) when treeh->tree has no parent.
 * When the parent's label differs from the tree's own label, the parent's
 * plain Term integer is returned.  When the labels are equal, the result of
 * ccIndFromTree(parent) is returned.
 */
int
tree_ccparent_term(TreeHist* treeh)
{
  static int s1Int = 0;
  if(s1Int == 0)
    {
      ECString s1Name("S1");
      s1Int = Term::get(s1Name)->toInt();
    }

  assert(treeh);
  InputTree* node = treeh->tree;
  assert(node);
  InputTree* parent = node->parent();
  if(!parent) return s1Int;

  const ECString& parentLabel = parent->term();
  const Term* parentTerm = Term::get(parentLabel);
  assert(parentTerm);
  int plainInt = parentTerm->toInt();

  if(parentLabel == node->term())
    {
      assert(!parentTerm->terminal_p());
      return ccIndFromTree(parent);
    }
  return plainInt;
}
/*
 * Only defined for the positions directly beside the head (treeh->pos equal
 * to treeh->hpos - 1 or treeh->hpos + 1); every other position gives NULL.
 *
 * The neighbouring child at hpos is fetched with tree_find() and one of its
 * own children is returned:
 *
 *   pos left of the head : its first child when headPosFromTree() of the
 *       neighbour is > 0; otherwise its second child, or NULL when it has
 *       fewer than two children.
 *   pos right of the head: its last child when headPosFromTree() of the
 *       neighbour is below size()-1; otherwise its second-to-last child, or
 *       NULL when it has fewer than two children.
 */
InputTree*
tree_2rel_tree(TreeHist* treeh)
{
  int pos = treeh->pos;
  int hpos = treeh->hpos;
  if(pos != hpos-1 && pos != hpos+1) return NULL;

  if(pos < hpos)
    {
      InputTree* neighbour = tree_find(treeh, +1);
      int neighbourHead = headPosFromTree(neighbour);
      if(neighbourHead > 0) return neighbour->subTrees().front();
      if(neighbour->subTrees().size() < 2) return NULL;
      InputTreesIter second = neighbour->subTrees().begin();
      second++;
      return *second;
    }

  InputTree* neighbour = tree_find(treeh, -1);
  int neighbourHead = headPosFromTree(neighbour);
  /* comparison deliberately left in its mixed signed/unsigned form */
  if(neighbourHead < neighbour->subTrees().size()-1)
    return neighbour->subTrees().back();
  if(neighbour->subTrees().size() < 2) return NULL;
  InputTrees::reverse_iterator secondLast = neighbour->subTrees().rbegin();
  secondLast++;
  return *secondLast;
}
/*
 * Returns the child of treeh->tree at index treeh->pos + n.
 * The index must be within [0, number of children); this is enforced only
 * by assert.
 */
InputTree*
tree_find(TreeHist* treeh, int n)
{
  int m = treeh->pos + n;
  assert(m >= 0);
  InputTree* tree = treeh->tree;
  assert(m < static_cast<int>(tree->subTrees().size()));

  InputTreesIter subTreeIter = tree->subTrees().begin();
  int i = 0;
  for( ; subTreeIter != tree->subTrees().end() ; subTreeIter++ )
    {
      if(i == m) return *subTreeIter;
      i++;
    }
  /* a bare string literal is always true, so it must be negated to fire */
  assert(!"should never get here");
  return NULL;
}
/*
 * Scans the children of treeh->tree from the left up to, but not including,
 * index treeh->pos.  A child whose term isOpen() sets a flag; a child whose
 * term isClosed() clears it (both tests are applied to every child, in that
 * order).  Returns 0 if the flag is still set at the end, 1 otherwise.
 */
int
tree_noopenQr(TreeHist* treeh)
{
  InputTree* tree = treeh->tree;
  const int stopAt = treeh->pos;
  const int childCount = tree->subTrees().size();

  bool quoteOpen = false;
  InputTreesIter child = tree->subTrees().begin();
  for(int idx = 0 ; idx != stopAt ; idx++, child++)
    {
      InputTree* subTree = *child;
      assert(idx < childCount);
      const Term* trm = Term::get(subTree->term());
      if(trm->isOpen() && !quoteOpen) quoteOpen = true;
      if(trm->isClosed() && quoteOpen) quoteOpen = false;
    }
  return quoteOpen ? 0 : 1;
}
/* the function called by each thread is "mainLoop" */ void* mainLoop(void* arg) { loopArg *loopA = (loopArg*)arg; istream* testSStream = loopA->inpt; ostream* pstatStream = loopA->outpt; int id = loopA->id; double log600 = log2(600.0); PrintStack printStack; for( ; ; ) { InputTree correct; InputTree* cuse; /* first lock to read in the material */ pthread_mutex_lock(&readlock); if( !*testSStream ) { pthread_mutex_unlock(&readlock); break; } *testSStream >> correct; if( !*testSStream ){ pthread_mutex_unlock(&readlock); break; } totWords += correct.length()+1; int locCount = sentenceCount++; list<ECString> wtList; correct.make(wtList); SentRep sr( wtList ); // used in precision calc ExtPos extPos; if(params.extPosIfstream) extPos.read(params.extPosIfstream,sr); pthread_mutex_unlock(&readlock); cuse = &correct; int len = correct.length(); if(len > params.maxSentLen) continue; //cerr << "Len = " << len << endl; /* if( !params.field().in(sentenceCount) ) { sentenceCount++; continue; } if(sentenceCount < -1) { sentenceCount++; continue; } sentenceCount++; */ vector<ECString> poslist; correct.makePosList(poslist); ScoreTree sc; sc.setEquivInts(poslist); MeChart* chart = new MeChart( sr,extPos,id ); chart->parse( ); Item* topS = chart->topS(); if(!topS) { cerr << "Parse failed" << endl; cerr << correct << endl; error(" could not parse "); delete chart; continue; } // compute the outside probabilities on the items so that we can // skip doing detailed computations on the really bad ones chart->set_Alphas(); Bst& bst = chart->findMapParse(); if( bst.empty()) error( "mapProbs did not return answer"); float bestF = -1; int i; int numVersions = 0; Link diffs(0); //cerr << "Need num diff: " << Bchart::Nth << endl; printStruct printS; printS.sentenceCount = locCount; printS.numDiff = 0; for(numVersions = 0 ; ; numVersions++) { short pos = 0; Val* val = bst.next(numVersions); if(!val) { //cerr << "Breaking" << endl; break; } InputTree* mapparse = inputTreeFromBsts(val,pos,sr); 
bool isU; int dummy = 0; diffs.is_unique(mapparse, isU, dummy); // cerr << "V " << isU << " " << numVersions << *mapparse << endl; if(isU) { printS.probs.push_back(val->prob()); printS.trees.push_back(mapparse); printS.numDiff++; } else { delete mapparse; } if(printS.numDiff >= Bchart::Nth) break; if(numVersions > 20000) break; } ParseStats* locPst = new ParseStats[Bchart::Nth]; ParseStats bestPs; for(i = 0 ; i <printS.numDiff ; i++) { InputTree *mapparse = printS.trees[i]; assert(mapparse); sc.trips.clear(); ParseStats pSt; sc.recordGold(cuse,pSt); sc.precisionRecall(mapparse,pSt); float newF = pSt.fMeasure(); cerr << printS.sentenceCount << "\t" << newF << endl; if(newF > bestF) { bestF = newF; bestPs = pSt; } if(histPoints[i]) { locPst[i] += bestPs; } } if(printS.numDiff < Bchart::Nth) { for(i = printS.numDiff ; i < Bchart::Nth ; i++) { if(histPoints[i]) locPst[i] += bestPs; } } pthread_mutex_lock(&scorelock); for(i = 0 ; i < Bchart::Nth ; i++) totPst[i]+=locPst[i]; pthread_mutex_unlock(&scorelock); int numPrinted; /* put the sentence with which we just finished at the end of the printStack*/ printStack.push_back(printS); PrintStack::iterator psi = printStack.begin(); /* now look at each item from the front of the print stack to see if it should be printed now */ pthread_mutex_lock(&writelock); for( numPrinted =0; psi != printStack.end(); numPrinted++ ) { printStruct& pstr=(*psi); if(pstr.sentenceCount != printCount) break; *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n"; printCount++; for(i = 0 ; i < pstr.numDiff ; i++) { InputTree* mapparse = pstr.trees[i]; assert(mapparse); double logP =log2(pstr.probs[i]); logP -= (sr.length()*log600); *pstatStream << logP << "\n"; if(Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n"; else { mapparse->printproper(*pstatStream); *pstatStream << "\n"; } delete mapparse; } *pstatStream << endl; psi++; } pthread_mutex_unlock(&writelock); for(i = 0 ; i < numPrinted ; i++) printStack.pop_front(); 
if(Feature::isLM) { double lgram = log2(bst.sum()); lgram -= (sr.length()*log600); double pgram = pow(2,lgram); double iptri = chart->triGram();; double ltri = (log2(iptri)-sr.length()*log600); double ptri = pow(2.0,ltri); double pcomb1 = (0.667 * pgram)+(0.333 * ptri); double lcom1 = log2(pcomb1); totGram -= lgram; totTri -= ltri; totMix -= lcom1; if(locCount%10 == 9) { cerr << locCount << "\t"; cerr << pow(2.0,totGram/(double)totWords); cerr <<"\t" << pow(2.0,totTri/(double)totWords); cerr << "\t" << pow(2.0,totMix/(double)(totWords)); cerr << endl; } } if(locCount%50 == 1) { cerr << sentenceCount << "\t"; for(int i = 0 ; i < Bchart::Nth ; i++) if(histPoints[i]) { cerr << i << " " << totPst[i].fMeasure() << "\t"; } cerr << endl; } delete chart; delete [] locPst; } return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pHsgt" << endl; for(int n = 0 ; n < MAXNUMNTS ; n++) numTerm[n] = 0; Term::init( path ); readHeadInfo(path); int sentenceCount = 0; ECString s1lex("^^"); ECString s1nm("S1"); int s1Int = Term::get(s1nm)->toInt(); UnitRules ur; ur.init(); while(cin) { //if(sentenceCount > 4000) break; if(sentenceCount%10000 == 0) cerr << sentenceCount << endl; InputTree parse; cin >> parse; //cerr << parse << endl; if(!cin) break; if(parse.length() == 0) break; EcSPairs wtList; parse.make(wtList); InputTree* par; par = &parse; addWwData(par); incrWordData(s1Int, s1lex); ur.gatherData(par); sentenceCount++; } ECString resultsString(path); resultsString += "pSgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int numWords = 0; resultsStream << " \n"; //leave space for number of words; resultsStream.precision(3); ECString lastWord; int wordFreq = 0; WordMap::iterator wmi = wordMap.begin(); resultsStream << wordMap.size() << "\n\n"; for( ; wmi != wordMap.end() ; wmi++) { ECString w = (*wmi).first; resultsStream << w << "\t"; PosD& posd = (*wmi).second; PosD::iterator pdi = posd.begin(); int count = 0; for( ; pdi != posd.end(); pdi++) { int posInt = (*pdi).first; int c = (*pdi).second; count += c; float p = (float)c/(float)numTerm[posInt]; resultsStream << posInt << " " << p << " "; } resultsStream << "| " << count << "\n"; } ur.setData(path); return 1; }
/*
 * Returns the Term integer of the tag of a word at the edge of the material
 * identified by treeh, or Term::stopTerm's integer when that word position
 * is before the sentence start or equals endPos.
 *
 * The word position is chosen as follows (sz = number of children):
 *   pos < 0 or sz == 0 : tree->start() - 1
 *   pos == sz          : tree->finish()
 *   otherwise, for the child at index pos:
 *       pos left of hpos            -> child->start() - 1
 *       pos right of hpos           -> child->finish()
 *       pos == hpos and blInd != 0  -> child->start() - 1
 *       pos == hpos and blInd == 0  -> child->finish()
 */
int
tree_B(TreeHist* treeh, int blInd)
{
  InputTree* tree = treeh->tree;
  int pos = treeh->pos;
  int hpos = treeh->hpos;
  int sz = tree->subTrees().size();
  int wpos;
  assert(pos <= sz);

  if(pos < 0 || sz == 0)
    {
      wpos = tree->start()-1;
    }
  else if(pos == sz)
    {
      wpos = tree->finish();
    }
  else
    {
      /* walk to the child at index pos */
      InputTreesIter iti = tree->subTrees().begin();
      for(int skipped = 0 ;
          iti != tree->subTrees().end() && skipped < pos ;
          skipped++)
        iti++;

      if(iti != tree->subTrees().end())
        {
          InputTree* st = *iti;
          bool useLeftEdge = (pos < hpos) || (pos == hpos && blInd);
          if(useLeftEdge) wpos = st->start()-1;
          else wpos = st->finish();
        }
    }

  assert(wpos <= endPos);
  if(wpos < 0 || wpos == endPos) return Term::stopTerm->toInt();
  return Term::get(sentence[wpos]->term())->toInt();
}
int main(int argc, char *argv[]) { struct rlimit core_limits; core_limits.rlim_cur = 0; core_limits.rlim_max = 0; setrlimit( RLIMIT_CORE, &core_limits ); ECArgs args( argc, argv ); assert(args.nargs() == 2); conditionedType = args.arg(0); cerr << "start trainRs: " << conditionedType << endl; ECString path( args.arg( 1 ) ); if(args.isset('L')) Feature::setLM(); Term::init(path); readHeadInfo(path); Pst pst(path); if(Feature::isLM) ClassRule::readCRules(path); addSubFeatureFns(); Feature::init(path, conditionedType); whichInt = Feature::whichInt; int ceFunInt = Feature::conditionedFeatureInt[Feature::whichInt]; Feature::conditionedEvent = SubFeature::Funs[ceFunInt]; Feat::Usage = PARSE; ECString ftstr(path); ftstr += conditionedType; ftstr += ".g"; ifstream fts(ftstr.c_str()); if(!fts) { cerr << "Could not find " << ftstr << endl; assert(fts); } tRoot = new FeatureTree(fts); //puts it in root; cout.precision(3); cerr.precision(3); lamInit(); InputTree* trainingData[1001]; int usedCount = 0; sentenceCount = 0; for( ; ; sentenceCount++) { if(sentenceCount%10000 == 1) { // cerr << conditionedType << ".tr " //<< sentenceCount << endl; } if(usedCount >= 1000) break; InputTree* correct = new InputTree; cin >> (*correct); if(correct->length() == 0) break; if(!cin) break; EcSPairs wtList; correct->make(wtList); InputTree* par; par = correct; trainingData[usedCount++] = par; } if(Feature::isLM) pickLogBases(trainingData,sentenceCount); procGSwitch = true; for(pass = 0 ; pass < 10 ; pass++) { if(pass%2 == 1) cout << "Pass " << pass << endl; goThroughSents(trainingData, sentenceCount); updateLambdas(); //printLambdas(cout); zeroData(); } ECString resS(path); resS += conditionedType; resS += ".lambdas"; ofstream res(resS.c_str()); res.precision(3); printLambdas(res); printLambdas(cout); cout << "Total params = " << FeatureTree::totParams << endl; cout << "Done: " << (int)sbrk(0) << endl; }
int main(int argc, char *argv[]) { struct rlimit core_limits; core_limits.rlim_cur = 0; core_limits.rlim_max = 0; setrlimit( RLIMIT_CORE, &core_limits ); ECArgs args( argc, argv ); assert(args.nargs() == 2); if(args.isset('N')) numGram = atoi(args.value('N').c_str()); Feature::setLM(); if(args.isset('L')) Term::Language = args.value('L'); string path( args.arg( 1 ) ); if(Term::Language == "Ch") readHeadInfoCh(path); else readHeadInfo(path); string conditionedType( args.arg(0) ); cerr << "start kn3Counts " << conditionedType << endl; int minCount = 1; if(args.isset('m')) minCount = atoi(args.value('m').c_str()); Feat::Usage = KNCOUNTS; FeatureTree::minCount = minCount; Term::init(path); readHeadInfo(path); Pst pst(path); addSubFeatureFns(); Feature::assignCalc(conditionedType); FeatureTree::root() = new FeatureTree(); Feature::init(path, conditionedType); int wI = Feature::whichInt; int ceFunInt = Feature::conditionedFeatureInt[wI]; Feature::conditionedEvent = SubFeature::Funs[ceFunInt]; string trainingString( path ); int sentenceCount = 0; for( ; ; sentenceCount++) { if(sentenceCount%10000 == 1) { cerr << "rCounts " << sentenceCount << endl; } InputTree correct; cin >> correct; //if(sentenceCount > 1000) break; if(correct.length() == 0) break; //cerr <<sentenceCount << correct << endl; EcSPairs wtList; correct.make(wtList); InputTree* par; int strt = 0; par = &correct; makeSent(par); curS = par; gatherFfCounts(par, 0); if(wI == TTCALC || wI == WWCALC) { list<InputTree*> dummy2; InputTree stopInputTree(par->finish(),par->finish(), wI==TTCALC ? 
"" : "^^", "STOP","", dummy2,NULL,NULL); stopInputTree.headTree() = &stopInputTree; TreeHist treeh(&stopInputTree,0); treeh.hpos = 0; callProcG(&treeh); } } finalProbComputation(); string resS(path); resS += conditionedType; resS += ".g"; ofstream res(resS.c_str()); assert(res); FTreeMap& fts = FeatureTree::root()->subtree; FTreeMap::iterator fti = fts.begin(); for( ; fti != fts.end() ; fti++) { int asVal = (*fti).first; (*fti).second->printFTree(asVal, res); } res.close(); cout << "Tot words: " << totWords << endl; cout << "Total params for " << conditionedType << " = " << FeatureTree::totParams << endl; }
/*
 * Recursively adds to the chart the items and edges that correspond to
 * `tree`, and returns the Item created for the root of `tree`.  Returns
 * NULL when an item or edge could not be created for this node or for any
 * node beneath it.
 *
 * If this is a terminal node, the rhs will be a word; otherwise it will be
 * a rule expansion consisting of several Item s.  In both cases the
 * right-hand side is bracketed by the stop items at tree->start() and
 * tree->finish().
 */
Item*
Bchart::
edgesFromTree(InputTree* tree)
{
  int b0 = tree->num();
  const Term* trm = Term::get(tree->term());
  assert(trm);
  if(printDebug() > 1005)
    cerr << "EFIE " << trm->name() << " " << b0 << endl;

  if(trm->terminal_p())
    {
      ECString tmpW1 = tree->word();
      char chars[512];   /* scratch buffer handed to toLower */
      ECString tmpW = toLower(tmpW1.c_str(), chars);

      int wInt = wtoInt(tmpW);

      /* each item is checked before it is written to */
      Item* lhs = add_item(b0, trm, tree->start());
      if(!lhs) return NULL;
      lhs->start() = tree->start();
      lhs->finish() = tree->finish();

      Item* rhs = add_item2(b0, trm, wInt,tmpW);
      if(!rhs) return NULL;
      rhs->finish() = tree->finish();
      rhs->start() = tree->start();

      Items subItems;
      subItems.push_back(stops[tree->start()]);
      subItems.push_back(rhs);
      subItems.push_back(stops[tree->finish()]);
      Edge* edg = add_edge(lhs, subItems);
      if(!edg) return NULL;

      edg->prob() = pHst(wInt,trm->toInt());
      edg->num() = b0;
      if(printDebug() > 5)
        cerr << "LHS " << *lhs << " " << tmpW << edg->prob() << endl;
      return lhs;
    }

  Item* lhs = add_item(b0, trm, -1);
  assert(lhs);
  lhs->start() = tree->start();
  lhs->finish() = tree->finish();

  Items subItems;
  subItems.push_back(stops[tree->start()]);
  InputTreesIter iti = tree->subTrees().begin();
  for( ; iti != tree->subTrees().end() ; iti++)
    {
      InputTree* stree = (*iti);
      cerr << "WBA "<< stree->term() << *stree << endl;
      Item* itm = edgesFromTree(stree);
      if(!itm) return NULL;
      subItems.push_back(itm);
    }
  subItems.push_back(stops[tree->finish()]);

  Edge* edg = add_edge(lhs, subItems);
  if(!edg) return NULL;
  edg->num() = b0;
  assignRProb(edg);

  if (printDebug() > 5)
    {
      cerr << "Saw edge " << *edg << ": p=" << edg->prob() << endl;
    }
  return lhs;
}
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); ECString path(args.arg(0)); cerr << "At start of pUgT" << endl; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) { posCounts[i] = 0; posCapCounts[i] = 0; posDenoms[i] = 0; posUCounts[i] = 0; posDashCounts[i] = 0; } for(i = 0 ; i < MAXNUMTS ; i++) totCounts[i] = 0; i = 0; for( ; ; ) { if(i++%10000 == 1) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; //cerr << parse << endl; if(parse.length() == 0) break; if(!cin) break; curSent = &parse; addWwData(&parse); sentenceCount++; } ECString resultsString(path); resultsString += "pUgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /* we print out p(unknown|tag) p(Capital|tag) p(hasDash|tag, unknown) note for Capital the denom is different because we ignore the first two words of the sentence */ int nm = Term::lastTagInt()+1; for(i = 0 ; i < nm ; i++) { resultsStream << i << "\t"; float pugt = 0; float pudenom = (float)posDenoms[i]; if(pudenom > 0) pugt = (float)posUCounts[i]/pudenom; resultsStream << pugt << "\t"; if(posCounts[i] == 0) resultsStream << 0 << "\t"; else resultsStream << (float) posCapCounts[i]/ (float)posCounts[i] << "\t"; if(posUCounts[i] == 0) resultsStream << 0; else resultsStream << (float)posDashCounts[i]/posUCounts[i] ; resultsStream << endl; } ECString resultsString2(path); resultsString2 += "nttCounts.txt"; ofstream resultsStream2(resultsString2.c_str()); assert(resultsStream2); for(i = 0 ; i <= Term::lastNTInt() ; i++) { resultsStream2 << i << "\t"; resultsStream2 << totCounts[i] << "\n"; } return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pSfgt" << endl; for(int n = 0 ; n < 140 ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); //???; int sentenceCount = 0; int wordCount = 0; int processedCount = 0; /*int i, j; for(i = 0 ; i < 60 ; i++) for(j = 0 ; j < 30 ; j++) data[i][j] = 0; */ int i = 0; while(cin) { if(i++%5000 == 1) cerr << i << endl; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0 && cin) continue; if(parse.length()==0 ||!cin) break; addWwData(&parse); processedCount++; wordCount += parse.length(); } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /*int totNt[30]; for(i = 0 ; i < 30 ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } */ resultsStream << numEndings << "\n"; for(i = 0 ; i < 140 ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } cout<<"totol sentence:"<<processedCount<<endl; cout<<"total suffix:"<<numEndings<<endl; return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pTgNt" << endl; for(int n = 0 ; n < MAXNUMTS ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int wordCount = 0; int processedCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) for(j = 0 ; j < MAXNUMNTS ; j++) data[i][j] = 0; i = 0; while(cin) { if(i%10000 == 0) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0) break; const Term* resTerm = addWwData(&parse); processedCount++; wordCount += parse.length(); i++; } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int totNt[MAXNUMTS]; for(i = 0 ; i < MAXNUMTS ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } resultsStream << numEndings << "\n"; for(i = 0 ; i < MAXNUMTS ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } return 0; }