// Probability-style score for word `shU` under tag `t`.
// The word is lower-cased and looked up with pHst(); a positive result routes
// the computation to psktt(), anything else routes it to psutt().
//   shU      - word as it appeared in the input (case preserved)
//   t        - integer tag id
//   word_num - position of the word, forwarded unchanged to psktt()/psutt()
double
Pst::
pstt(ECString& shU, int t, int word_num)
{
  char lowerBuf[1024];
  ECString lowered(langAwareToLower(shU.c_str(), lowerBuf));

  // The result is not used below; the call is kept exactly as in the original.
  const Term* unusedTerm = Term::fromInt(t);
  (void)unusedTerm;

  const double seenProb = pHst(lowered, t);
  if(seenProb > 0)
    return psktt(shU, t, word_num);
  return psutt(shU, t, word_num);
}
// Loads class rules from "<path>rules.txt" and, when it exists, "<path>rules.m".
//
// rules.txt holds records of the form  <term> <term> <int> <term>.  A token
// equal to "Thirds:" switches the destination from rBundles2_ to rBundles3_
// for every record that follows.  The integer field is decremented by one
// before it is stored.
//
// rules.m is optional: if it cannot be opened the function returns quietly.
// Its first token is skipped (the original comment calls it "all thirds"),
// then records of the form  <term> <term> <term>  are appended to rBundlesm_.
void
ClassRule::
readCRules(ECString path)
{
  ECString flnm = path;
  flnm += L"rules.txt";
  ifstream is(flnm.c_str());
  assert(is);

  int wh = 2;
  int modm = Term::stopTerm->toInt();
  ECString tmp;
  for( ; ; )
    {
      is >> tmp;
      // Test the stream BEFORE looking at tmp: after a failed read tmp still
      // holds the previous token, and a stale "Thirds:" would otherwise make
      // the loop spin forever.
      if(!is) break;
      if(tmp == L"Thirds:")
	{
	  wh = 3;
	  continue;
	}
      int d = Term::get(tmp)->toInt();
      is >> tmp;
      int m = Term::get(tmp)->toInt();
      int r;
      is >> r;
      r--;
      is >> tmp;
      int t = Term::get(tmp)->toInt();
      assert(is);
      ClassRule cr(d,m,r,t);
      if(wh == 3) rBundles3_[d][m-modm].push_back(cr);
      else rBundles2_[d][m-modm].push_back(cr);
    }

  flnm = path;
  flnm += L"rules.m";
  ifstream ism(flnm.c_str());
  if(!ism) return;
  ism >> tmp; // leading header token is discarded
  for( ; ; )
    {
      ism >> tmp;
      if(!ism) break;
      int d = Term::get(tmp)->toInt();
      ism >> tmp;
      int m = Term::get(tmp)->toInt();
      ism >> tmp;
      int t = Term::get(tmp)->toInt();
      assert(ism);
      ClassRule cr(d,m,0,t);
      rBundlesm_[d][m].push_back(cr);
    }
}
// Maps a (word, tag) pair onto the auxiliary tags "AUX"/"AUXG" where they apply.
// NOTE: the function prints "AUX!!!" and hits assert(0) before any of the
// mapping logic, exactly as in the original; the code after the assert only
// runs in builds where assert is compiled out.
//   wM   - the word (any case)
//   trmM - the tag (any case); returned unchanged when no rule applies
ECString
auxify(ECString wM, ECString trmM)
{
  char scratch[128];
  ECString wordUp = toUpper(wM.c_str(), scratch);
  ECString tagUp = toUpper(trmM.c_str(), scratch);

  cerr << "AUX!!!" << endl;
  assert(0);

  if( isVerb( tagUp ) )
    {
      if( isAux( wordUp ) || hasAuxSuf( wordUp ) )
	return "AUX";
      if( isAuxg( wordUp ) )
	return "AUXG";
    }

  // "BES" / "HVS": unusual tags seen in switchboard data.
  const bool oddTag = (tagUp == "BES" || tagUp == "HVS");
  if( !oddTag )
    return trmM;

  assert(wordUp == "'S" || wordUp == "-S");
  return "AUX";
}
// Looks up pHegt() for the final two characters of `sh` and tag `t`.
// Words shorter than three characters get the fixed value .01 instead.
float
Pst::
pegt(ECString& sh, int t)
{
  const int wordLen = sh.length();
  if(wordLen < 3)
    return .01;

  ECString ending = sh.substr(wordLen - 2, 2);
  float endingProb = pHegt(ending, t);
  return endingProb;
}
// True when the part of `word` starting at its first apostrophe is equal to
// one of the entries of `suffixes` (scanned until a null/zero entry).
// A word without an apostrophe never matches.
bool
hasAuxSuf( ECString word )
{
  const size_t quotePos = word.find_first_of("\'");
  if( quotePos == static_cast<size_t>(-1) )
    return false;

  // Everything from the apostrophe to the end of the word.
  const ECString tail = word.substr(quotePos);

  int idx = 0;
  while( suffixes[idx] )
    {
      if( tail == suffixes[idx] )
	return true;
      idx++;
    }
  return false;
}
double Pst:: pCapgt(const ECString& shU, int t, int word_num) { if(word_num == 0) return 1; //cerr << "pCapgt = " << pcap << endl; if(shU.length() < 2) return 1; //ignore words of length 1; char temp[1024]; ECString sh(langAwareToLower(shU.c_str(),temp)); bool cap = false; if(shU[0] != sh[0] && shU[1] == sh[1]) cap = true; double pcap = pHcapgt(t); return cap ? pcap : (1 - pcap); }
// Reads the per-term probability tables.
//   pTString - file of (term, char, char, prob) records, preceded by a count;
//              stored in the freshly allocated pHegt_ array (egtSize_ entries)
//   pUstring - file with one record per term index 0..Term::lastNTInt():
//              term id followed by three floats, stored through pHugt(),
//              pHcapgt() and pHhypgt() in that order
// Both streams have ignoreComment() applied before any data is read.
void
Pst::
readTermProbs(ECString& pTString, ECString& pUstring)
{
  ifstream pTstream(pTString.c_str());
  assert(pTstream);
  ignoreComment(pTstream);

  ifstream pUstream(pUstring.c_str());
  assert(pUstream);
  ignoreComment(pUstream);

  for(int row = 0 ; row <= Term::lastNTInt() ; row++)
    {
      int term;
      float value;

      pUstream >> term;
      pUstream >> value;
      pHugt(term) = value;

      pUstream >> value;
      // A zero capitalisation probability is replaced: anything might be
      // capitalized.
      if(value == 0) value = .00001;
      pHcapgt(term) = value;

      pUstream >> value;
      pHhypgt(term) = value;
    }

  int numpT;
  pTstream >> numpT;
  pHegt_ = new Phegt[numpT];
  egtSize_ = numpT;

  for(int slot = 0 ; pTstream ; slot++)
    {
      int term;
      char c0;
      char c1;
      float prob;

      pTstream >> term;
      if(!pTstream) break;
      assert(slot < numpT);

      pTstream >> c0 >> c1 >> prob;

      Phegt& entry = pHegt_[slot];
      entry.t = term;
      entry.e[0] = c0;
      entry.e[1] = c1;
      entry.p = prob;
    }
}
int tree_watpos(int pos) { if(pos < 0) { return nullWordInt; } ECString wrd = sentence[pos]->head(); char tmp[1024]; ECString wrdl=langAwareToLower(wrd.c_str(), tmp); const WordInfo* wi = Pst::get(wrdl); assert(wi); int ans = wi->toInt(); assert(ans >= 0); return ans; }
// Reads one child entry from `is` into subtree.array_[c].
// The next token is the child's index; the child then reads itself from the
// stream and gets its `back` pointer set to this node.
// Returns the index that was read, or -1 when the stream failed or the token
// was the literal "Selected".
int
FeatureTree::
readOneLevel0(istream& is, int c)
{
  ECString token;
  is >> token;
  if(!is || token == "Selected")
    return -1;

  const int childInd = atoi(token.c_str());

  FeatureTree& child = subtree.array_[c];
  child.ind_ = childInd;
  child.read(is, Feature::ftTree[Feature::whichInt].left);
  child.back = this;

  return childInd;
}
double Pst:: psutt(const ECString& shU, int t, int word_num) { //cerr << "Unknown word: " << shU << " for tag: " << t << endl; double ans = pHugt(t); //cerr << "pHugt = " << ans << endl; if(ans == 0) return 0; double phyp = pHypgt(shU,t); ans *= phyp; //cerr << "pHypgt = " << phyp << endl; double phcp = pCapgt(shU,t, word_num); ans *= phcp; ans *= .000001; if(Term::fromInt(t)->openClass()) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(),temp)); float phegt = pegt(sh,t); if(phegt == 0) phegt = .00001; //if(phegt == 0) phegt = .00005; //cerr << "pegt( " << sh << " | " << t << " ) = " << phegt << endl; ans *= phegt; } else ans *= .00000001; //cerr << "psutt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
// Builds a token stream that reads from the file called `name`.
// The useCin flag is set to 0 and all tokenizer state starts out empty / zero.
ewDciTokStrm::
ewDciTokStrm( const ECString& name )
  : istr_(name.c_str()),
    useCin(0),
    // parts of the current word that have not been processed yet
    savedWrd_( "" ),
    // the "on-deck" word
    nextWrd_( "" ),
    // number of dots seen in an ellipsis
    ellipFlag( 0 ),
    // 0 except in a special state (the original comment is cut off:
    // "ParenFlag = 0 except while words")
    parenFlag( 0 )
{
}
int main(int argc, char *argv[]) { if (argc==1) { usage(argv[0]); return 1; } ECArgs args( argc, argv ); params.init( args ); int numThreads=DEFAULT_NTHREAD; if(args.isset('t')) numThreads = atoi(args.value('t').c_str()); TimeIt timeIt; ECString path( args.arg( 0 ) ); generalInit(path); ECString flnm = "dummy"; if(args.nargs()==2) flnm = args.arg(1); if(Bchart::tokenize) { if (args.nargs() == 1) { tokStream = new ewDciTokStrm(cin); } else { ifstream* stream = new ifstream(flnm.c_str()); tokStream = new ewDciTokStrm(*stream); } } if(args.nargs()==2) nontokStream = new ifstream(args.arg(1).c_str()); else nontokStream = &cin; pthread_t thread[MAXNUMTHREADS]; int id[MAXNUMTHREADS]; int i; for(i = 0 ; i < numThreads ; i++){ id[i]=i; pthread_create(&thread[i],0,mainLoop, &id[i]); } for(i=0; i<numThreads; i++){ pthread_join(thread[i],0); } pthread_exit(0); return 0; }
double Pst:: pHypgt(const ECString& shU, int t) { bool hyp = false; if( shU.find("-") >= 0) hyp = true; double phyp = pHhypgt(t); return hyp ? phyp : (1 - phyp); }
// Builds a string of length nSize from rString: a buffer of nSize + 1
// characters is allocated, nSize characters are copied into it with
// ECStringOP::StrNCopy, and a terminating 0 is written at index nSize.
// If the allocation yields a null pointer nothing is copied.
ECString::ECString(const ECString &rString, EC_U32 nSize)
:m_pStr(EC_NULL)
,m_nSize(nSize)
{
    m_pStr = new EC_CHAR[nSize+1];
    if(!m_pStr)
        return;

    m_nSize = nSize;
    ECStringOP::StrNCopy(m_pStr, rString.ToCStr(), nSize);
    // Always terminate, whatever StrNCopy left at the end of the buffer.
    m_pStr[m_nSize] = 0;
}
double Pst:: psktt(const ECString& shU, int t, int word_num) { char temp[1024]; ECString sh(langAwareToLower(shU.c_str(), temp)); double ans = pHst(sh, t); double phcp = pCapgt(shU,t, word_num); ans *= phcp; double put = pHugt(t); ans *= (1-put); //cerr << "psktt( " << shU << " | " << t << " ) = " << ans << endl; return ans; }
// Probability-style score for word `shU` under tag `t`.
// The word is lower-cased into a MAXWORDLENGTH buffer and looked up with
// pHst(); a positive result routes the computation to psktt(), anything else
// routes it to psutt().
//   shU      - word as it appeared in the input (case preserved)
//   t        - integer tag id
//   word_num - position of the word, forwarded unchanged to psktt()/psutt()
double
Pst::
pstt(ECString& shU, int t, int word_num)
{
  char lowerBuf[MAXWORDLENGTH];
  ECString lowered(toLower(shU.c_str(), lowerBuf, MAXWORDLENGTH));

  // The result is not used below; the call is kept exactly as in the original.
  const Term* unusedTerm = Term::fromInt(t);
  (void)unusedTerm;

  const double seenProb = pHst(lowered, t);
  const bool isKnown = (seenProb > 0);
  return isKnown ? psktt(shU, t, word_num)
		 : psutt(shU, t, word_num);
}
// Returns 1 when `nc` begins with one of the 20 accepted function tags listed
// below (case-sensitive prefix match), 0 otherwise.
//
// An earlier 22-entry list, matched on exact equality, is kept for reference:
//   "TMP", "LOC", "ADV", "TPC", "BNF", "DIR", "EXT", "NOM", "DTV", "LGS",
//   "PRD", "PUT", "SBJ", "VOC", "MNR", "PRP", "CLR", "CLF", "HLN", "TTL",
//   "DEI", "PLE"
int
okFTag(ECString nc)
{
  static ECString ftgs[20] = {"AO", "ATR", "CAG", "CC", "CCL", "CCQ",
			      "CCT", "CD", "CI", "CPRED", "CREG", "ET",
			      "Fn", "IMPERS", "MOD", "NEG", "NF", "PASS",
			      "SUJ", "VOC"};

  const ECString* const tagsEnd = ftgs + 20;
  for(const ECString* tag = ftgs ; tag != tagsEnd ; ++tag)
    {
      // Compares the first tag->length() characters of nc with the tag.
      if(nc.compare(0, tag->length(), *tag) == 0)
	return 1;
    }
  return 0;
}
void incrWordData(int lhsInt, ECString wupper) { char temp[128]; ECString w(toLower(wupper.c_str(), temp)); numTerm[lhsInt]++; WordMap::iterator wmi = wordMap.find(w); if(wmi == wordMap.end()) { wordMap[w][lhsInt] = 1; return; } PosD& posd = (*wmi).second; PosD::iterator pdi = posd.find(lhsInt); if(pdi == posd.end()) { posd[lhsInt] = 1; } else (*pdi).second++; }
// Builds a flat list of (tag, score) pairs for the word `head`:
// each tag id is pushed as a double, immediately followed by its score.
//
// If useHeadC() finds the lower-cased word, its recorded tags are visited
// (tags above Term::lastTagInt() are skipped) and scored with psktt(); a zero
// score produces a warning on cerr.  Otherwise every tag 0..lastTagInt() with
// a non-zero pHugt() is scored with psutt().
list<double>
Pst::
wordPlistConstruct(const ECString& head, int word_num)
{
  list<double> ans;
  char lowerBuf[1024];
  ECString headL(langAwareToLower(head.c_str(), lowerBuf));
  const WordInfo* wi = useHeadC( headL );

  if( !wi )
    {
      // No entry for this word: score it against every tag that allows
      // unknown words.
      for(int tag = 0 ; tag <= Term::lastTagInt() ; tag++)
	{
	  double unknownProb = pHugt(tag);
	  if(unknownProb == 0) continue;
	  double score = psutt(head, tag, word_num);
	  ans.push_back(tag);
	  ans.push_back(score);
	}
      return ans;
    }

  const int numEntries = wi->stSize();
  for(int k = 0 ; k < numEntries ; k++)
    {
      Phsgt& entry = wi->st_[k];
      int tInt = entry.term;
      if(tInt > Term::lastTagInt()) continue;
      double score = psktt(head, tInt, word_num);
      ans.push_back(tInt);
      ans.push_back(score);
      if(score == 0)
	cerr << "Warning, prob = 0 for word = " << head
	     << " and pos = " << tInt << endl;
    }
  return ans;
}
int whichEmpty(const ECString& emp) { //return 1; // should make system not require empty type to be correct. if(emp == "0") return NULLEMP; if(emp == "*U*") return UNITEMP; int sz = emp.length(); if(sz < 1) { return 0; } if(sz >= 5) { //ECString emp1(emp.substr(0, 5)); ECString emp1(emp,0,5); if(emp1 == "*NOT*") return NOTEMP; if(emp1 == "*RNR*") return RNREMP; if(emp1 == "*ICH*") return ICHEMP; if(emp1 == "*EXP*") return EXPEMP; if(emp1 == "*PPA*") return PPAEMP; } if(sz >= 3) { //ECString emp1(emp.substr(0, 3)); ECString emp1(emp,0, 3); if(emp1 == "*T*") { return TREMP; } if(emp1 == "*NOT*") return NOTEMP; if(emp1 == "*?*") return QEMP; } //ECString emp2(emp.substr(0,1)); ECString emp2(emp,0,1); if(emp2 == "*") return NPEMP; return 0; }
// Loads key/value items from the text file at pFilePath.
// The file is opened in "rt" mode and read line by line; each line is trimmed
// and passed to FindKey()/FindVal().  A line is stored when its key is not
// null (a null value is accepted).  Reading stops at the first failed read or
// once CONFIG_MAX_ITEM_COUNT items are held.  If the file cannot be opened
// the object is left with zero items.
//
// The file is now closed on every path out of the loop; previously the
// "table full" case returned directly and left m_sFile open.
ECConfig::ECConfig(const EC_PCHAR pFilePath)
:m_nCount(0)
,m_sFile(pFilePath)
{
    if(EC_Err_None != m_sFile.Open((EC_PCHAR)"rt"))
        return;

    ECString pKey;
    ECString pVal;
    ECString sCfgStr;
    EC_CHAR pCfgStr[CONFIG_MAX_ITEM_LENGTH] = {0};

    EC_U32 nReadRet = m_sFile.ReadStrLine(pCfgStr, CONFIG_MAX_ITEM_LENGTH);
    while((EC_Err_None == nReadRet) && (m_nCount < CONFIG_MAX_ITEM_COUNT))
    {
        sCfgStr = pCfgStr;
        sCfgStr.Trim();
        sCfgStr.TrimEnd();

        pKey = FindKey(sCfgStr.ToCStr());
        pVal = FindVal(sCfgStr.ToCStr());
        if(!pKey.IsNull() /* a null value is allowed */)
        {
            m_pConfigItem[m_nCount].m_sKey = pKey;
            m_pConfigItem[m_nCount].m_sVal = pVal;
            m_nCount++;
        }

        nReadRet = m_sFile.ReadStrLine(pCfgStr, CONFIG_MAX_ITEM_LENGTH);
    }

    m_sFile.Close();
}
void Feature:: init(ECString& path, ECString& conditioned) { assignCalc(conditioned); int f; for(f = 0 ; f < MAXNUMFS ; f++) { float* vec = new float[15]; lambdas_[whichInt][f] = vec; for(int k = 0 ; k < 15 ; k++) vec[k] = 0.0; } ECString dataECString(path); dataECString += "featInfo."; dataECString += conditioned; ifstream dataStrm(dataECString.c_str()); assert(dataStrm); int auxCnts[MAXNUMFS]; int i; for(i = 0 ; i < MAXNUMFS ; i++) auxCnts[i] = 0; Feature::ftTreeFromInt[whichInt][0] = &(Feature::ftTree[whichInt]); int conditionedInt; dataStrm >> conditionedInt; conditionedFeatureInt[whichInt] = conditionedInt; int num; for(num = 0 ; ; num++) { int n, subf, pos, cpr; ECString nm; ECString tmp; dataStrm >> tmp; if(tmp == "--") break; n = atoi(tmp.c_str()); dataStrm >> nm; dataStrm >> subf; dataStrm >> pos; dataStrm >> tmp; if(tmp == "|") cpr = -1; else { cpr = atoi(tmp.c_str()); dataStrm >> tmp; assert(tmp == "|"); } array_[whichInt][n-1] = new Feature(n, nm, subf, pos, cpr); array_[whichInt][n-1]->auxCnt = auxCnts[pos]; auxCnts[pos]++; createFTypeTree(Feature::ftTreeFromInt[whichInt][pos], n, whichInt); } Feature::total[whichInt] = num; for(num = 0 ; ; num++) { int n, fn; ECString nm; ECString tmp; dataStrm >> tmp; if(tmp == "--") break; n = atoi(tmp.c_str()); dataStrm >> nm; dataStrm >> fn; list<int> featList; for( ; ; ) { dataStrm >> tmp; if(tmp == "|") break; int f = atoi(tmp.c_str()); featList.push_back(f); } SubFeature::fromInt(n, whichInt) = new SubFeature(n, nm, fn, featList); assert(SubFeature::fromInt(n, whichInt)); } SubFeature::total[whichInt] = num; /* set the universal function num on feats from their subfeats */ for(num = 0 ; num < Feature::total[whichInt] ; num++) { Feature* f = array_[whichInt][num]; f->usubFeat = SubFeature::fromInt(f->subFeat,whichInt)->usf; } /* set up the table from universal subfeat nums to subfeat nums */ for(num = 0 ; num < MAXNUMFS ; num++) SubFeature::ufArray[whichInt][num] = -1; for(num = 0 ; num < 
SubFeature::total[whichInt] ; num++) { SubFeature* sf = SubFeature::fromInt(num,whichInt); SubFeature::ufArray[whichInt][sf->usf] = num; } }
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); //AnsTreeHeap::print = true; /* o = basic, but not debugging, output. l = length of sentence to be proceeds 0-40 is default n = work on each #'th line. d = print out debugging info at level # t = report timings (requires o) */ int i; params.init( args ); //cerr << "Starting wwBCTest " << Feature::sLM << endl; if( args.nargs() > 2 || args.nargs() < 2 ) // require path name error( "Need exactly two args."); ECString path( args.arg( 0 ) ); generalInit(path); for(int i = 0 ; i < 500 ; i++) histPoints[i] = false; histPoints[0] = true; if (Bchart::Nth == 50 || Bchart::Nth == 500 || Bchart::Nth == 1000) histPoints[1] = histPoints[9] = histPoints[24] = histPoints[49] = true; if (Bchart::Nth == 500 || Bchart::Nth == 1000) histPoints[99] = histPoints[249] = histPoints[499] = true; if(Bchart::Nth == 1000) histPoints[749] = histPoints[999] = true; TimeIt timeIt; ECString testSString = args.arg(1); ifstream testSStream(testSString.c_str()); if( !testSStream ) { cerr << "Could not find " << testSString << endl; error( "No testSstream"); } ECString pstatStreamName( params.fileString()); pstatStreamName += "PStatInfo/pStat"; pstatStreamName += params.numString(); pstatStreamName += ".txt"; ofstream pstatStream( pstatStreamName.c_str(), ios::out); if( !pstatStream ) { cerr << "Looking to output to " << pstatStreamName << endl; error( "unable to open pstat stream"); } for(i = 0 ; i < MAXNUMTHREADS ; i++) globalGi[i] =NULL; pthread_t thread[MAXNUMTHREADS]; loopArg lA[MAXNUMTHREADS]; for(i = 0 ; i < numThreads ; i++){ lA[i].id = i; lA[i].inpt=&testSStream; lA[i].outpt=&pstatStream; pthread_create(&thread[i],0,mainLoop, (void*)&lA[i]); } for(i=0; i<numThreads; i++){ pthread_join(thread[i],0); } for(int i = 0 ; i < Bchart::Nth ; i++) if(histPoints[i]) { cerr << i << " " << totPst[i].fMeasure() << "\t"; } cerr << endl; if(Feature::isLM) { cerr << pow(2.0,totGram/(double)totWords); cerr <<"\t" << 
pow(2.0,totTri/(double)totWords); cerr << "\t" << pow(2.0,totMix/(double)(totWords)); cerr << endl; } if( args.isset('t') ) timeIt.finish(sentenceCount); pthread_exit(0); return 0; }
// returns whether string ends with pattern bool endsWith(ECString str, ECString pattern) { int index = str.rfind(pattern); return index == ((signed int)str.size() - (signed int)pattern.size()); }
static void* mainLoop(void* arg) { int *id = reinterpret_cast<int *>(arg); PrintStack printStack; for( ; ; ) { SentRep* srp = new SentRep(params.maxSentLen); pthread_mutex_lock(&readlock); if(Bchart::tokenize) *tokStream >> *srp; else *nontokStream >> *srp; int locCount = sentenceCount++; ExtPos extPos; if(params.extPosIfstream) extPos.read(params.extPosIfstream,*srp); pthread_mutex_unlock(&readlock); if( !params.field().in(sentenceCount) ) continue; printStruct printS; printS.name = srp->getName(); printS.sentenceCount = locCount; printS.numDiff = 0; int len = srp->length(); if (len == 0) { break; } if (len > params.maxSentLen) { ECString msg("skipping sentence longer than specified limit of "); msg += intToString(params.maxSentLen); WARN( msg.c_str() ); printSkipped(srp,NULL,printStack,printS); continue; } // handle input containing reserved word Bchart::HEADWORD_S1; could probably do // better (like undo replacement before printing) but this seems sufficient. int i; for (i = 0; i < len; ++i) { ECString& w = ((*srp)[i]).lexeme(); if (w == Bchart::HEADWORD_S1) { ECString msg = ECString("Replacing reserved token \"") + Bchart::HEADWORD_S1; msg += "\" at index " + intToString(i) + " of input with token \"^^^\""; WARN( msg.c_str() ); w = "^^^"; } } MeChart* chart = new MeChart( *srp,extPos,*id ); chart->parse( ); Item* topS = chart->topS(); if(!topS) { if (extPos.hasExtPos()) { WARN("Parse failed: !topS -- reparsing without POS constraints"); chart = new MeChart(*srp, *id); chart->parse(); topS = chart->topS(); if (!topS) { WARN("Reparsing without POS constraints failed too: !topS"); printSkipped(srp, chart, printStack, printS); continue; } } else { WARN( "Parse failed: !topS" ); printSkipped(srp,chart,printStack,printS); continue; } } bool failed = decodeParses(len, locCount, srp, chart, printS, printStack); if (failed) { continue; } if( printS.numDiff == 0) { if (extPos.hasExtPos()) { WARN("Parse failed from 0, inf or nan probabililty -- reparsing without POS 
constraints"); chart = new MeChart(*srp, *id); chart->parse(); bool failed = decodeParses(len, locCount, srp, chart, printS, printStack); if (failed || printS.numDiff == 0) { WARN("Parse failed from 0, inf or nan probabililty -- failed even without POS constraints"); printSkipped(srp,chart,printStack,printS); continue; } } else { WARN("Parse failed from 0, inf or nan probabililty"); printSkipped(srp,chart,printStack,printS); continue; } } /* put the sentence with which we just finished at the end of the printStack*/ printStack.push_back(printS); workOnPrintStack(&printStack); delete chart; delete srp; }