/*
 * pUgT pass: walk one parse tree and accumulate the per-tag counts
 * used to estimate p(unknown|tag), p(capitalized|tag) and
 * p(has-dash|tag, unknown).  Leaves (nodes carrying a word) update the
 * global count arrays indexed by the tag's integer id; interior nodes
 * recurse into their children.
 */
void
addWwData(InputTree* tree)
{
  ECString wTagNm = tree->term();
  const Term* trm = Term::get(wTagNm);
  int lhsInt = trm->toInt();
  totCounts[lhsInt]++;          /* every constituent/tag occurrence */
  if( tree->word() != "" )
    {
      /* Leaf: examine the word under this preterminal. */
      ECString hdLexU(tree->word());
      char temp[512];
      ECString hdLex(langAwareToLower(hdLexU.c_str(), temp));
      int len = hdLex.length();
      const WordInfo* wi = Pst::get(hdLex);
      if(!wi)
        {
          cerr << "Couldn't find entry for word '" << hdLex
               << "' in pSgT.txt" << endl;
          assert(wi);
          /* BUGFIX: with NDEBUG the assert above is a no-op and the
             old code went on to dereference a null wi; skip instead. */
          return;
        }
      /* Ignore words very close to the start of the sentence, those of
         length 1, and those whose capitalization is ambiguous (both of
         the first two characters changed under lowercasing). */
      if(tree->start() >= 2 && len > 1
         && !(hdLex[0] != hdLexU[0] && hdLex[1] != hdLexU[1]))
        {
          posCounts[lhsInt]++;
          /* first char changed, second did not: genuinely capitalized */
          if(hdLex[0] != hdLexU[0] && hdLex[1] == hdLexU[1])
            posCapCounts[lhsInt]++;
        }
      posDenoms[lhsInt]++;
      if(wi->c() <= 2)
        {
          /* Rare word (corpus count <= 2): proxy for "unknown". */
          posUCounts[lhsInt]++;
          /* strchr suffices: the old strpbrk searched a one-char set. */
          if(strchr(hdLex.c_str(), '-'))
            posDashCounts[lhsInt]++;
        }
      return;
    }
  /* Interior node: recurse into the children. */
  InputTrees& st = tree->subTrees();
  for(InputTrees::iterator subTreeIter = st.begin() ;
      subTreeIter != st.end() ; subTreeIter++)
    addWwData(*subTreeIter);
}
/*
 * pHsgt pass: recursively tally (tag, word) observations over one
 * parse tree.  Children are processed first; then, if this node is a
 * preterminal carrying a word, its (tag-int, word) pair is recorded
 * via incrWordData().
 */
void
addWwData(InputTree* tree)
{
  /* Depth-first over the children. */
  InputTrees& kids = tree->subTrees();
  for(InputTrees::iterator it = kids.begin() ; it != kids.end() ; ++it)
    addWwData(*it);

  /* Only leaves carry a word; interior nodes contribute nothing here. */
  if(tree->word() == "")
    return;

  ECString wrd = tree->word();
  const Term* tagTerm = Term::get(tree->term());
  assert(tagTerm);
  incrWordData(tagTerm->toInt(), wrd);
}
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pHsgt" << endl; for(int n = 0 ; n < MAXNUMNTS ; n++) numTerm[n] = 0; Term::init( path ); readHeadInfo(path); int sentenceCount = 0; ECString s1lex("^^"); ECString s1nm("S1"); int s1Int = Term::get(s1nm)->toInt(); UnitRules ur; ur.init(); while(cin) { //if(sentenceCount > 4000) break; if(sentenceCount%10000 == 0) cerr << sentenceCount << endl; InputTree parse; cin >> parse; //cerr << parse << endl; if(!cin) break; if(parse.length() == 0) break; EcSPairs wtList; parse.make(wtList); InputTree* par; par = &parse; addWwData(par); incrWordData(s1Int, s1lex); ur.gatherData(par); sentenceCount++; } ECString resultsString(path); resultsString += "pSgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int numWords = 0; resultsStream << " \n"; //leave space for number of words; resultsStream.precision(3); ECString lastWord; int wordFreq = 0; WordMap::iterator wmi = wordMap.begin(); resultsStream << wordMap.size() << "\n\n"; for( ; wmi != wordMap.end() ; wmi++) { ECString w = (*wmi).first; resultsStream << w << "\t"; PosD& posd = (*wmi).second; PosD::iterator pdi = posd.begin(); int count = 0; for( ; pdi != posd.end(); pdi++) { int posInt = (*pdi).first; int c = (*pdi).second; count += c; float p = (float)c/(float)numTerm[posInt]; resultsStream << posInt << " " << p << " "; } resultsStream << "| " << count << "\n"; } ur.setData(path); return 1; }
/*
 * pSfgt pass: walk one parse tree collecting word-ending (suffix)
 * statistics.  For each leaf whose tag is open-class and whose word is
 * at least 4 characters long, if the word is rare (corpus count <= 4)
 * the (tag, last-character) pair is recorded via incrEndData() and the
 * tag's denominator numTerm[] is bumped.  Interior nodes recurse.
 */
void
addWwData(InputTree* tree)
{
  if( tree->word() != "" )
    {
      ECString wTagNm = tree->term();
      const Term* trm = Term::get(wTagNm);
      int lhsInt = trm->toInt();
      /* Closed-class tags (determiners etc.) carry no useful suffix
         signal, so only open-class tags are counted. */
      if(trm->openClass())
        {
          ECString hdLexU(tree->word());
          char temp[512];
          ECString hdLex(langAwareToLower(hdLexU.c_str(), temp));
          int len = hdLex.length();
          if(len >= 4)
            {
              ECString e = lastCharacter(hdLex);
              const WordInfo* wi = Pst::get(hdLex);
              if(!wi)
                {
                  cerr << "Couldn't find entry for word '" << hdLex
                       << "' in pSgT.txt" << endl;
                  assert(wi);
                  /* BUGFIX: the old `if(!wi){ assert(wi); }` was a
                     no-op under NDEBUG and fell through to a null
                     dereference below; bail out instead. */
                  return;
                }
              if(wi->c() <= 4)
                {
                  /* Rare word: its ending is informative. */
                  incrEndData(lhsInt, e);
                  numTerm[lhsInt]++;
                }
            }
        }
      return;
    }
  /* Interior node: recurse into the children. */
  InputTrees& st = tree->subTrees();
  for(InputTrees::iterator subTreeIter = st.begin() ;
      subTreeIter != st.end() ; subTreeIter++)
    addWwData(*subTreeIter);
}
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pSfgt" << endl; for(int n = 0 ; n < 140 ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); //???; int sentenceCount = 0; int wordCount = 0; int processedCount = 0; /*int i, j; for(i = 0 ; i < 60 ; i++) for(j = 0 ; j < 30 ; j++) data[i][j] = 0; */ int i = 0; while(cin) { if(i++%5000 == 1) cerr << i << endl; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0 && cin) continue; if(parse.length()==0 ||!cin) break; addWwData(&parse); processedCount++; wordCount += parse.length(); } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /*int totNt[30]; for(i = 0 ; i < 30 ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } */ resultsStream << numEndings << "\n"; for(i = 0 ; i < 140 ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } cout<<"totol sentence:"<<processedCount<<endl; cout<<"total suffix:"<<numEndings<<endl; return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pTgNt" << endl; for(int n = 0 ; n < MAXNUMTS ; n++) numTerm[n] = 0; ECString resultsString(path); resultsString += "endings.txt"; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int wordCount = 0; int processedCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) for(j = 0 ; j < MAXNUMNTS ; j++) data[i][j] = 0; i = 0; while(cin) { if(i%10000 == 0) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; if(!cin) break; if(parse.length() == 0) break; const Term* resTerm = addWwData(&parse); processedCount++; wordCount += parse.length(); i++; } ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int totNt[MAXNUMTS]; for(i = 0 ; i < MAXNUMTS ; i++) totNt[i] = 0; for(i = 0 ; i <= Term::lastTagInt() ; i++) { for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++) totNt[j] += data[i][j]; } resultsStream << numEndings << "\n"; for(i = 0 ; i < MAXNUMTS ; i++) { endMap::iterator emi = endData[i].begin(); for( ; emi != endData[i].end() ; emi++) { ECString ending = (*emi).first; int cnt = (*emi).second; resultsStream << i << "\t" << ending << "\t" << (float) cnt / (float) numTerm[i] << endl; //<< "\n"; } } return 0; }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); ECString path(args.arg(0)); cerr << "At start of pUgT" << endl; Term::init( path ); if(args.isset('L')) Term::Language = args.value('L'); readHeadInfo(path); Pst pst(path); int sentenceCount = 0; int i, j; for(i = 0 ; i < MAXNUMTS ; i++) { posCounts[i] = 0; posCapCounts[i] = 0; posDenoms[i] = 0; posUCounts[i] = 0; posDashCounts[i] = 0; } for(i = 0 ; i < MAXNUMTS ; i++) totCounts[i] = 0; i = 0; for( ; ; ) { if(i++%10000 == 1) cerr << i << endl; //if(i > 1000) break; InputTree parse; cin >> parse; //cerr << parse << endl; if(parse.length() == 0) break; if(!cin) break; curSent = &parse; addWwData(&parse); sentenceCount++; } ECString resultsString(path); resultsString += "pUgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); /* we print out p(unknown|tag) p(Capital|tag) p(hasDash|tag, unknown) note for Capital the denom is different because we ignore the first two words of the sentence */ int nm = Term::lastTagInt()+1; for(i = 0 ; i < nm ; i++) { resultsStream << i << "\t"; float pugt = 0; float pudenom = (float)posDenoms[i]; if(pudenom > 0) pugt = (float)posUCounts[i]/pudenom; resultsStream << pugt << "\t"; if(posCounts[i] == 0) resultsStream << 0 << "\t"; else resultsStream << (float) posCapCounts[i]/ (float)posCounts[i] << "\t"; if(posUCounts[i] == 0) resultsStream << 0; else resultsStream << (float)posDashCounts[i]/posUCounts[i] ; resultsStream << endl; } ECString resultsString2(path); resultsString2 += "nttCounts.txt"; ofstream resultsStream2(resultsString2.c_str()); assert(resultsStream2); for(i = 0 ; i <= Term::lastNTInt() ; i++) { resultsStream2 << i << "\t"; resultsStream2 << totCounts[i] << "\n"; } return 0; }