int main(int argc, char *argv[]) {
  ECArgs args(argc, argv);
  params.init(args);
  ECString path(args.arg(0));
  generalInit(path);

  /* we don't use sentenceCount since treeLogProb may parse the
     sentence multiple times */
  int index = 0;
  while (true) {
    if (!cin) break;
    InputTree correct;
    cin >> correct;
    int len = correct.length();
    if (len == 0) break;
    if (len > params.maxSentLen) continue;
    double logProb;
    try {
      logProb = treeLogProb(&correct);
    }
    catch (ParserError) {
      logProb = 0;
    }
    cout << index << "\t" << logProb << endl;
    index++;
  }
  return 0;
}
int main(int argc, char *argv[]) {
  ECArgs args(argc, argv);
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pHsgt" << endl;

  for (int n = 0; n < MAXNUMNTS; n++) numTerm[n] = 0;

  Term::init(path);
  readHeadInfo(path);

  int sentenceCount = 0;
  ECString s1lex("^^");
  ECString s1nm("S1");
  int s1Int = Term::get(s1nm)->toInt();

  UnitRules ur;
  ur.init();

  while (cin) {
    //if(sentenceCount > 4000) break;
    if (sentenceCount % 10000 == 0) cerr << sentenceCount << endl;
    InputTree parse;
    cin >> parse;
    //cerr << parse << endl;
    if (!cin) break;
    if (parse.length() == 0) break;
    EcSPairs wtList;
    parse.make(wtList);
    InputTree* par = &parse;
    addWwData(par);
    incrWordData(s1Int, s1lex);
    ur.gatherData(par);
    sentenceCount++;
  }

  ECString resultsString(path);
  resultsString += "pSgT.txt";
  ofstream resultsStream(resultsString.c_str());
  assert(resultsStream);

  int numWords = 0;
  resultsStream << " \n";   // leave space for number of words
  resultsStream.precision(3);

  ECString lastWord;
  int wordFreq = 0;
  WordMap::iterator wmi = wordMap.begin();
  resultsStream << wordMap.size() << "\n\n";
  for ( ; wmi != wordMap.end(); wmi++) {
    ECString w = (*wmi).first;
    resultsStream << w << "\t";
    PosD& posd = (*wmi).second;
    PosD::iterator pdi = posd.begin();
    int count = 0;
    for ( ; pdi != posd.end(); pdi++) {
      int posInt = (*pdi).first;
      int c = (*pdi).second;
      count += c;
      float p = (float)c / (float)numTerm[posInt];
      resultsStream << posInt << " " << p << " ";
    }
    resultsStream << "| " << count << "\n";
  }

  ur.setData(path);
  return 1;
}
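/* Illustrative sketch (not part of the original tools): one way to read back
   an entry line from the pSgT.txt format written above -- a word, followed by
   (tag-int, probability) pairs, terminated by "| totalCount".  The WordEntry
   struct and parseWordLine name are hypothetical; the header lines (word
   count) written before the entries would need to be consumed separately. */
#include <sstream>
#include <string>
#include <utility>
#include <vector>

struct WordEntry {                                   // hypothetical container for one line
  std::string word;
  std::vector<std::pair<int, float>> tagProbs;       // (tag int, p(word|tag)) pairs
  int totalCount;
};

static bool parseWordLine(const std::string& line, WordEntry& e) {
  std::istringstream ls(line);
  if (!(ls >> e.word)) return false;                 // first token is the word itself
  e.tagProbs.clear();
  std::string tok;
  while (ls >> tok && tok != "|") {                  // pairs continue until the "|" marker
    std::istringstream ts(tok);
    int tagInt;
    float p;
    ts >> tagInt;
    ls >> p;
    e.tagProbs.push_back(std::make_pair(tagInt, p));
  }
  ls >> e.totalCount;                                // count written after "| "
  return true;
}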
int main(int argc, char *argv[]) {
  struct rlimit core_limits;
  core_limits.rlim_cur = 0;
  core_limits.rlim_max = 0;
  setrlimit(RLIMIT_CORE, &core_limits);

  ECArgs args(argc, argv);
  assert(args.nargs() == 2);
  conditionedType = args.arg(0);
  cerr << "start trainRs: " << conditionedType << endl;
  ECString path(args.arg(1));
  if (args.isset('L')) Feature::setLM();

  Term::init(path);
  readHeadInfo(path);
  Pst pst(path);
  if (Feature::isLM) ClassRule::readCRules(path);

  addSubFeatureFns();
  Feature::init(path, conditionedType);
  whichInt = Feature::whichInt;
  int ceFunInt = Feature::conditionedFeatureInt[Feature::whichInt];
  Feature::conditionedEvent = SubFeature::Funs[ceFunInt];
  Feat::Usage = PARSE;

  ECString ftstr(path);
  ftstr += conditionedType;
  ftstr += ".g";
  ifstream fts(ftstr.c_str());
  if (!fts) {
    cerr << "Could not find " << ftstr << endl;
    assert(fts);
  }
  tRoot = new FeatureTree(fts); // puts it in root

  cout.precision(3);
  cerr.precision(3);
  lamInit();

  InputTree* trainingData[1001];
  int usedCount = 0;
  sentenceCount = 0;
  for ( ; ; sentenceCount++) {
    if (sentenceCount % 10000 == 1) {
      //cerr << conditionedType << ".tr " << sentenceCount << endl;
    }
    if (usedCount >= 1000) break;
    InputTree* correct = new InputTree;
    cin >> (*correct);
    if (correct->length() == 0) break;
    if (!cin) break;
    EcSPairs wtList;
    correct->make(wtList);
    InputTree* par = correct;
    trainingData[usedCount++] = par;
  }
  if (Feature::isLM) pickLogBases(trainingData, sentenceCount);

  procGSwitch = true;
  for (pass = 0; pass < 10; pass++) {
    if (pass % 2 == 1) cout << "Pass " << pass << endl;
    goThroughSents(trainingData, sentenceCount);
    updateLambdas();
    //printLambdas(cout);
    zeroData();
  }

  ECString resS(path);
  resS += conditionedType;
  resS += ".lambdas";
  ofstream res(resS.c_str());
  res.precision(3);
  printLambdas(res);
  printLambdas(cout);

  cout << "Total params = " << FeatureTree::totParams << endl;
  cout << "Done: " << (int)sbrk(0) << endl;
  return 0;
}
int main(int argc, char *argv[]) {
  struct rlimit core_limits;
  core_limits.rlim_cur = 0;
  core_limits.rlim_max = 0;
  setrlimit(RLIMIT_CORE, &core_limits);

  ECArgs args(argc, argv);
  assert(args.nargs() == 2);
  if (args.isset('N')) numGram = atoi(args.value('N').c_str());
  Feature::setLM();
  if (args.isset('L')) Term::Language = args.value('L');

  string path(args.arg(1));
  if (Term::Language == "Ch") readHeadInfoCh(path);
  else readHeadInfo(path);

  string conditionedType(args.arg(0));
  cerr << "start kn3Counts " << conditionedType << endl;

  int minCount = 1;
  if (args.isset('m')) minCount = atoi(args.value('m').c_str());
  Feat::Usage = KNCOUNTS;
  FeatureTree::minCount = minCount;

  Term::init(path);
  readHeadInfo(path);
  Pst pst(path);
  addSubFeatureFns();

  Feature::assignCalc(conditionedType);
  FeatureTree::root() = new FeatureTree();
  Feature::init(path, conditionedType);
  int wI = Feature::whichInt;
  int ceFunInt = Feature::conditionedFeatureInt[wI];
  Feature::conditionedEvent = SubFeature::Funs[ceFunInt];

  string trainingString(path);
  int sentenceCount = 0;
  for ( ; ; sentenceCount++) {
    if (sentenceCount % 10000 == 1) {
      cerr << "rCounts " << sentenceCount << endl;
    }
    InputTree correct;
    cin >> correct;
    //if(sentenceCount > 1000) break;
    if (correct.length() == 0) break;
    //cerr << sentenceCount << correct << endl;
    EcSPairs wtList;
    correct.make(wtList);
    InputTree* par;
    int strt = 0;
    par = &correct;
    makeSent(par);
    curS = par;
    gatherFfCounts(par, 0);
    if (wI == TTCALC || wI == WWCALC) {
      list<InputTree*> dummy2;
      InputTree stopInputTree(par->finish(), par->finish(),
                              wI == TTCALC ? "" : "^^",
                              "STOP", "", dummy2, NULL, NULL);
      stopInputTree.headTree() = &stopInputTree;
      TreeHist treeh(&stopInputTree, 0);
      treeh.hpos = 0;
      callProcG(&treeh);
    }
  }
  finalProbComputation();

  string resS(path);
  resS += conditionedType;
  resS += ".g";
  ofstream res(resS.c_str());
  assert(res);
  FTreeMap& fts = FeatureTree::root()->subtree;
  FTreeMap::iterator fti = fts.begin();
  for ( ; fti != fts.end(); fti++) {
    int asVal = (*fti).first;
    (*fti).second->printFTree(asVal, res);
  }
  res.close();

  cout << "Tot words: " << totWords << endl;
  cout << "Total params for " << conditionedType
       << " = " << FeatureTree::totParams << endl;
  return 0;
}
int main(int argc, char *argv[]) {
  ECArgs args(argc, argv);
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pSfgt" << endl;

  for (int n = 0; n < 140; n++) numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init(path);
  if (args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path); //???

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  /*
  int i, j;
  for(i = 0 ; i < 60 ; i++)
    for(j = 0 ; j < 30 ; j++)
      data[i][j] = 0;
  */

  int i = 0;
  while (cin) {
    if (i++ % 5000 == 1) cerr << i << endl;
    InputTree parse;
    cin >> parse;
    if (!cin) break;
    if (parse.length() == 0 && cin) continue;
    if (parse.length() == 0 || !cin) break;
    addWwData(&parse);
    processedCount++;
    wordCount += parse.length();
  }

  ofstream resultsStream(resultsString.c_str());
  assert(resultsStream);

  /*
  int totNt[30];
  for(i = 0 ; i < 30 ; i++) totNt[i] = 0;
  for(i = 0 ; i <= Term::lastTagInt() ; i++) {
    for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++)
      totNt[j] += data[i][j];
  }
  */

  resultsStream << numEndings << "\n";
  for (i = 0; i < 140; i++) {
    endMap::iterator emi = endData[i].begin();
    for ( ; emi != endData[i].end(); emi++) {
      ECString ending = (*emi).first;
      int cnt = (*emi).second;
      resultsStream << i << "\t" << ending << "\t"
                    << (float)cnt / (float)numTerm[i] << endl;
    }
  }
  cout << "total sentences: " << processedCount << endl;
  cout << "total suffixes: " << numEndings << endl;
  return 0;
}
int main(int argc, char *argv[]) {
  ECArgs args(argc, argv);
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pTgNt" << endl;

  for (int n = 0; n < MAXNUMTS; n++) numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init(path);
  if (args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  int i, j;
  for (i = 0; i < MAXNUMTS; i++)
    for (j = 0; j < MAXNUMNTS; j++)
      data[i][j] = 0;

  i = 0;
  while (cin) {
    if (i % 10000 == 0) cerr << i << endl;
    //if(i > 1000) break;
    InputTree parse;
    cin >> parse;
    if (!cin) break;
    if (parse.length() == 0) break;
    const Term* resTerm = addWwData(&parse);
    processedCount++;
    wordCount += parse.length();
    i++;
  }

  ofstream resultsStream(resultsString.c_str());
  assert(resultsStream);

  int totNt[MAXNUMTS];
  for (i = 0; i < MAXNUMTS; i++) totNt[i] = 0;
  for (i = 0; i <= Term::lastTagInt(); i++) {
    for (j = 0; j < (Term::lastNTInt() - Term::lastTagInt()); j++)
      totNt[j] += data[i][j];
  }

  resultsStream << numEndings << "\n";
  for (i = 0; i < MAXNUMTS; i++) {
    endMap::iterator emi = endData[i].begin();
    for ( ; emi != endData[i].end(); emi++) {
      ECString ending = (*emi).first;
      int cnt = (*emi).second;
      resultsStream << i << "\t" << ending << "\t"
                    << (float)cnt / (float)numTerm[i] << endl;
    }
  }
  return 0;
}
int main(int argc, char *argv[]) {
  ECArgs args(argc, argv);
  ECString path(args.arg(0));
  cerr << "At start of pUgT" << endl;

  Term::init(path);
  if (args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;
  int i, j;
  for (i = 0; i < MAXNUMTS; i++) {
    posCounts[i] = 0;
    posCapCounts[i] = 0;
    posDenoms[i] = 0;
    posUCounts[i] = 0;
    posDashCounts[i] = 0;
  }
  for (i = 0; i < MAXNUMTS; i++) totCounts[i] = 0;

  i = 0;
  for ( ; ; ) {
    if (i++ % 10000 == 1) cerr << i << endl;
    //if(i > 1000) break;
    InputTree parse;
    cin >> parse;
    //cerr << parse << endl;
    if (parse.length() == 0) break;
    if (!cin) break;
    curSent = &parse;
    addWwData(&parse);
    sentenceCount++;
  }

  ECString resultsString(path);
  resultsString += "pUgT.txt";
  ofstream resultsStream(resultsString.c_str());
  assert(resultsStream);

  /* we print out p(unknown|tag), p(Capital|tag), and p(hasDash|tag, unknown);
     note for Capital the denom is different because we ignore the first
     two words of the sentence */
  int nm = Term::lastTagInt() + 1;
  for (i = 0; i < nm; i++) {
    resultsStream << i << "\t";
    float pugt = 0;
    float pudenom = (float)posDenoms[i];
    if (pudenom > 0) pugt = (float)posUCounts[i] / pudenom;
    resultsStream << pugt << "\t";
    if (posCounts[i] == 0) resultsStream << 0 << "\t";
    else resultsStream << (float)posCapCounts[i] / (float)posCounts[i] << "\t";
    if (posUCounts[i] == 0) resultsStream << 0;
    else resultsStream << (float)posDashCounts[i] / posUCounts[i];
    resultsStream << endl;
  }

  ECString resultsString2(path);
  resultsString2 += "nttCounts.txt";
  ofstream resultsStream2(resultsString2.c_str());
  assert(resultsStream2);
  for (i = 0; i <= Term::lastNTInt(); i++) {
    resultsStream2 << i << "\t";
    resultsStream2 << totCounts[i] << "\n";
  }
  return 0;
}
/* the function called by each thread is "mainLoop" */
void* mainLoop(void* arg) {
  loopArg *loopA = (loopArg*)arg;
  istream* testSStream = loopA->inpt;
  ostream* pstatStream = loopA->outpt;
  int id = loopA->id;
  double log600 = log2(600.0);
  PrintStack printStack;

  for ( ; ; ) {
    InputTree correct;
    InputTree* cuse;

    /* first lock to read in the material */
    pthread_mutex_lock(&readlock);
    if (!*testSStream) {
      pthread_mutex_unlock(&readlock);
      break;
    }
    *testSStream >> correct;
    if (!*testSStream) {
      pthread_mutex_unlock(&readlock);
      break;
    }
    totWords += correct.length() + 1;
    int locCount = sentenceCount++;
    list<ECString> wtList;
    correct.make(wtList);
    SentRep sr(wtList);   // used in precision calc
    ExtPos extPos;
    if (params.extPosIfstream) extPos.read(params.extPosIfstream, sr);
    pthread_mutex_unlock(&readlock);

    cuse = &correct;
    int len = correct.length();
    if (len > params.maxSentLen) continue;
    //cerr << "Len = " << len << endl;

    /*
    if( !params.field().in(sentenceCount) ) { sentenceCount++; continue; }
    if(sentenceCount < -1) { sentenceCount++; continue; }
    sentenceCount++;
    */

    vector<ECString> poslist;
    correct.makePosList(poslist);
    ScoreTree sc;
    sc.setEquivInts(poslist);

    MeChart* chart = new MeChart(sr, extPos, id);
    chart->parse();
    Item* topS = chart->topS();
    if (!topS) {
      cerr << "Parse failed" << endl;
      cerr << correct << endl;
      error(" could not parse ");
      delete chart;
      continue;
    }

    // compute the outside probabilities on the items so that we can
    // skip doing detailed computations on the really bad ones
    chart->set_Alphas();

    Bst& bst = chart->findMapParse();
    if (bst.empty()) error("mapProbs did not return answer");

    float bestF = -1;
    int i;
    int numVersions = 0;
    Link diffs(0);
    //cerr << "Need num diff: " << Bchart::Nth << endl;
    printStruct printS;
    printS.sentenceCount = locCount;
    printS.numDiff = 0;

    /* collect up to Bchart::Nth distinct parses from the n-best list */
    for (numVersions = 0; ; numVersions++) {
      short pos = 0;
      Val* val = bst.next(numVersions);
      if (!val) {
        //cerr << "Breaking" << endl;
        break;
      }
      InputTree* mapparse = inputTreeFromBsts(val, pos, sr);
      bool isU;
      int dummy = 0;
      diffs.is_unique(mapparse, isU, dummy);
      //cerr << "V " << isU << " " << numVersions << *mapparse << endl;
      if (isU) {
        printS.probs.push_back(val->prob());
        printS.trees.push_back(mapparse);
        printS.numDiff++;
      } else {
        delete mapparse;
      }
      if (printS.numDiff >= Bchart::Nth) break;
      if (numVersions > 20000) break;
    }

    ParseStats* locPst = new ParseStats[Bchart::Nth];
    ParseStats bestPs;
    for (i = 0; i < printS.numDiff; i++) {
      InputTree* mapparse = printS.trees[i];
      assert(mapparse);
      sc.trips.clear();
      ParseStats pSt;
      sc.recordGold(cuse, pSt);
      sc.precisionRecall(mapparse, pSt);
      float newF = pSt.fMeasure();
      cerr << printS.sentenceCount << "\t" << newF << endl;
      if (newF > bestF) {
        bestF = newF;
        bestPs = pSt;
      }
      if (histPoints[i]) {
        locPst[i] += bestPs;
      }
    }
    if (printS.numDiff < Bchart::Nth) {
      for (i = printS.numDiff; i < Bchart::Nth; i++) {
        if (histPoints[i]) locPst[i] += bestPs;
      }
    }

    pthread_mutex_lock(&scorelock);
    for (i = 0; i < Bchart::Nth; i++) totPst[i] += locPst[i];
    pthread_mutex_unlock(&scorelock);

    int numPrinted;

    /* put the sentence with which we just finished at the end of the
       printStack */
    printStack.push_back(printS);
    PrintStack::iterator psi = printStack.begin();

    /* now look at each item from the front of the print stack to see if
       it should be printed now */
    pthread_mutex_lock(&writelock);
    for (numPrinted = 0; psi != printStack.end(); numPrinted++) {
      printStruct& pstr = (*psi);
      if (pstr.sentenceCount != printCount) break;
      *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n";
      printCount++;
      for (i = 0; i < pstr.numDiff; i++) {
        InputTree* mapparse = pstr.trees[i];
        assert(mapparse);
        double logP = log2(pstr.probs[i]);
        logP -= (sr.length() * log600);
        *pstatStream << logP << "\n";
        if (Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n";
        else {
          mapparse->printproper(*pstatStream);
          *pstatStream << "\n";
        }
        delete mapparse;
      }
      *pstatStream << endl;
      psi++;
    }
    pthread_mutex_unlock(&writelock);
    for (i = 0; i < numPrinted; i++) printStack.pop_front();

    if (Feature::isLM) {
      /* running perplexity-style numbers for the grammar, trigram, and
         mixture language models */
      double lgram = log2(bst.sum());
      lgram -= (sr.length() * log600);
      double pgram = pow(2, lgram);
      double iptri = chart->triGram();
      double ltri = (log2(iptri) - sr.length() * log600);
      double ptri = pow(2.0, ltri);
      double pcomb1 = (0.667 * pgram) + (0.333 * ptri);
      double lcom1 = log2(pcomb1);
      totGram -= lgram;
      totTri -= ltri;
      totMix -= lcom1;
      if (locCount % 10 == 9) {
        cerr << locCount << "\t";
        cerr << pow(2.0, totGram / (double)totWords);
        cerr << "\t" << pow(2.0, totTri / (double)totWords);
        cerr << "\t" << pow(2.0, totMix / (double)totWords);
        cerr << endl;
      }
    }
    if (locCount % 50 == 1) {
      cerr << sentenceCount << "\t";
      for (int i = 0; i < Bchart::Nth; i++)
        if (histPoints[i]) {
          cerr << i << " " << totPst[i].fMeasure() << "\t";
        }
      cerr << endl;
    }
    delete chart;
    delete [] locPst;
  }
  return 0;
}
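/* Illustrative sketch (assumptions noted): one way a driver could fan work
   out to mainLoop over pthreads.  It assumes only the loopArg layout implied
   above (fields inpt, outpt, id) and the shared readlock/writelock mutexes;
   runParser itself, the thread-array bound, and the stream parameters are
   hypothetical and not part of the original file. */
#include <cassert>
#include <pthread.h>

static void runParser(istream& input, ostream& output, int numThreads) {
  pthread_t thread[64];
  loopArg args[64];
  assert(numThreads <= 64);
  for (int i = 0; i < numThreads; i++) {
    args[i].id = i;            // per-thread id, handed to MeChart above
    args[i].inpt = &input;     // shared input stream, guarded by readlock
    args[i].outpt = &output;   // shared output stream, guarded by writelock
    pthread_create(&thread[i], NULL, mainLoop, (void*)&args[i]);
  }
  for (int i = 0; i < numThreads; i++)
    pthread_join(thread[i], NULL);  // wait for all workers to drain the input
}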