// Builds a dictionary over every feature name in names_ that begins with one
// of the given prefixes. The dictionary key is the feature name with the
// matched prefix stripped off; the entry is a vector of
// prefs.size()*numW_ values into which the feature's weights are copied.
//
//   prefs     - candidate prefixes; the first one that matches a name is used
//   util      - string utility handed to the Dictionary constructor
//   adjustPos - when true the slot is derived from the prefix index and the
//               key length (n-gram dictionaries); when false it is simply the
//               prefix index
//
// Returns a newly allocated dictionary, or NULL when no feature name matched
// any prefix. featuresAdded_ is incremented once per matching feature.
Dictionary<vector<FeatVal> > * KyteaModel::makeDictionaryFromPrefixes(const vector<KyteaString> & prefs, StringUtil* util, bool adjustPos) {
    typedef Dictionary<vector<FeatVal> >::WordMap WordMap;
    const int numPrefs = (int)prefs.size();
    WordMap entries;

    for(int featIdx = 0; featIdx < (int)names_.size(); featIdx++) {
        const KyteaString & featName = names_[featIdx];

        // Locate the first prefix this feature name starts with
        int prefIdx = 0;
        while(prefIdx < numPrefs && !featName.beginsWith(prefs[prefIdx]))
            prefIdx++;
        if(prefIdx == numPrefs)
            continue; // not covered by any prefix

        featuresAdded_++;
        KyteaString key = featName.substr(prefs[prefIdx].length());

        // Create the weight vector for this key the first time it is seen
        WordMap::iterator slot = entries.find(key);
        if(slot == entries.end())
            slot = entries.insert(WordMap::value_type(key, new vector<FeatVal>(prefs.size()*numW_))).first;

        // If this is an n-gram dictionary, adjust the position according to
        // n-gram length, otherwise just use the location of the prefix
        int base = (adjustPos ? (prefs.size()-prefIdx-key.length())*numW_ : prefIdx*numW_ );

        vector<FeatVal> & values = *slot->second;
        for(int w = 0; w < numW_; w++) {
            // The weight is read at feature index featIdx-1 and scaled by labels_[0]
            values[base+w] = getWeight(featIdx-1,w) * labels_[0];
        }
    }

    if(entries.size() == 0)
        return NULL;

    Dictionary<vector<FeatVal> > * dict = new Dictionary<vector<FeatVal> >(util);
    dict->buildIndex(entries);
    return dict;
}
// Searches for the best-scoring words that can be formed from a rack that
// contains one blank tile (represented by a space character).
//
// The blank is replaced in turn by every letter 'a'..'z'. For each
// substitution MAX_PERMUTATIONS successive permutations of the rack are
// generated, and every leading substring of each permutation is looked up in
// `words`; each hit is forwarded to processBlankWord together with the letter
// currently standing in for the blank.
//
//   words        - dictionary to search
//   originalWord - the rack, including the blank; passed through unchanged to
//                  processBlankWord
//   bestWords    - result set, maintained by processBlankWord
//   bestCount    - best score so far; reset to 0 on entry
//   letterMap    - passed through to processBlankWord
//
// If the rack contains no blank the function returns right after resetting
// bestCount (previously this ended in std::out_of_range from replace()).
void processBlank(const WordMap &words, string &originalWord,
  set<string> &bestWords, short &bestCount, const LetterMap &letterMap)
{
  bestCount = 0;

  const string::size_type blankPos = originalWord.find(' ');
  if(blankPos == string::npos)
    return; // no blank tile: nothing to substitute

  for(char letter = 'a'; letter <= 'z'; letter++)
  {
    // Start from the untouched rack for every letter. Reusing the permuted
    // string from the previous letter is only correct when MAX_PERMUTATIONS
    // steps happen to cycle back to the starting arrangement; otherwise the
    // substitution would overwrite a real tile instead of the blank.
    string word = originalWord;
    word[blankPos] = letter;

    for(int i = 0; i < MAX_PERMUTATIONS; i++)
    {
      for(string::size_type len = 1; len <= word.length(); len++)
      {
        WordMap::const_iterator itr = words.find(word.substr(0, len));

        if(itr != words.end())
          processBlankWord(words, originalWord, bestWords, bestCount,
            letterMap, itr, letter);
      } // for each word size

      next_permutation(word.begin(), word.end());
    } // for i
  } // for each letter in blank
} // processBlank()
void processNormal(const WordMap &words, string word, set<string> &bestWords, short &bestCount) { bestCount = 0; for(int i = 0; i < MAX_PERMUTATIONS; i++) { for(unsigned int j = 1; j <= word.length(); j++) { string shortenedWord; shortenedWord.assign(word, 0, j); WordMap::const_iterator itr = words.find(shortenedWord); if(itr != words.end() && itr->second >= bestCount) { if(itr->second > bestCount) { bestWords.clear(); bestCount = itr->second; } // if better than those previous bestWords.insert(itr->first); } // if at least as good as previous } // for each word size next_permutation(word.begin(), word.end()); } // for i } // processNormal()
// Rebuilds this dictionary's index from `input`.
//
// Raises an error through THROW_ERROR when `input` is empty. Otherwise any
// previously held data is cleared, a single fresh DictionaryState is appended
// to states_, and the index is constructed by buildGoto over the whole input
// range followed by buildFailures.
// NOTE(review): the goto/failure naming suggests an Aho-Corasick style
// automaton - confirm against buildGoto/buildFailures.
void Dictionary<Entry>::buildIndex(const WordMap & input) {
    const bool nothingToIndex = (input.size() == 0);
    if(nothingToIndex) {
        THROW_ERROR("Cannot build dictionary for no input");
    }

    // Start over from an empty structure holding only the initial state
    clearData();
    states_.push_back(new DictionaryState());

    buildGoto(input.begin(), input.end(), 0, 0);
    buildFailures();
}
bool Nce::parseFile(const QString &nce, int _class) { QString path = qApp->applicationDirPath(); #ifdef Q_OS_MAC path += "/../../.."; #endif path += "/nce/"; path += nce + "/"; QString classStr = QString::number(_class); if (_class < 10) classStr = "0" + QString::number(_class); path += classStr; QFileInfo fileInfo(path + ".txt"); if (!fileInfo.exists()) path += ".TXT"; else path += ".txt"; file_.setFileName(path); if (!file_.open(QIODevice::ReadOnly)) { qDebug() << "Read File Error!"+ path; return false; } QString text = file_.readAll(); if (file_.isOpen()) file_.close(); text = simpleChange(text); QStringList sentenceList = text.split(QRegExp("[\\.!\\?]"),QString::SkipEmptyParts); int sentenceSize = sentenceList.size(); ClassIndex index; index.nce = nce; index.class_ = _class; data_.classContent_.insert(index,text); WordMap wordMap; foreach (QString sentence, sentenceList) { QStringList wordList; wordList = sentence.split(QRegExp("\\W+"), QString::SkipEmptyParts); NceWordInfo wordInfo; wordInfo.index = index; // wordInfo.sentences.push_back(sentence); foreach (QString w, wordList) { Word word; word.index = index; word.word = w; wordMap.insert(word,wordInfo); for (WordMap::iterator it = wordMap.begin(), ie = wordMap.end(); it != ie; ++it){ if (it.key() == word) { NceWordInfo& info = it.value(); info.sentences.push_back(sentence); info.sentences.removeDuplicates(); } } }
void Console::printWordMap(const WordMap &wordMap) { Common::StringArray words; WordMap::const_iterator verb; for (verb = wordMap.begin(); verb != wordMap.end(); ++verb) words.push_back(Common::String::format("%s: %3d", toAscii(verb->_key).c_str(), wordMap[verb->_key])); Common::sort(words.begin(), words.end()); debugPrintColumns(words); }
// Counts how often each whitespace-separated token occurs in a text file and
// prints one "<token>: <count>" line per distinct token, in map key order.
// The file is argv[1] when given, otherwise "WordCount.cpp"; assure() is
// called on the stream right after it is opened.
int main(int argc, char* argv[]) {
  typedef map<string, int> WordMap;
  typedef WordMap::iterator WMIter;

  const char* fname = (argc > 1) ? argv[1] : "WordCount.cpp";
  ifstream in(fname);
  assure(in, fname);

  // operator[] creates a zero count the first time a token is seen
  WordMap counts;
  for(string token; in >> token; )
    counts[token]++;

  WMIter cur = counts.begin();
  while(cur != counts.end()) {
    cout << (*cur).first << ": " << (*cur).second << endl;
    cur++;
  }
} ///:~
// Counts how often each token produced by a StreamTokenizer occurs in a text
// file and prints one "<token>: <count>" line per distinct token.
// The file is argv[1] when given, otherwise "WordCount.cpp"; assure() is
// called on the stream right after it is opened. Tokens are read until the
// tokenizer returns an empty string; each mapped value exposes its count
// through val().
int main(int argc, char* argv[]) {
  // A string literal is an array of const char; binding it to a plain
  // char* is ill-formed since C++11, so the pointer must be const.
  const char* fname = "WordCount.cpp";
  if(argc > 1) fname = argv[1];
  ifstream in(fname);
  assure(in, fname);
  StreamTokenizer words(in);
  WordMap wordmap;
  string word;
  while((word = words.next()).size() != 0)
    wordmap[word]++;
  for(WMIter w = wordmap.begin(); w != wordmap.end(); w++)
    cout << (*w).first << ": "
      << (*w).second.val() << endl;
} ///:~
int main(int argc, char* argv[]) { requireArgs(argc, 1); ifstream in(argv[1]); assure(in, argv[1]); StreamTokenizer words(in); WordMap wordmap; string word; while ((word = words.next()).size() != 0) wordmap[word]++; for (WMIter w = wordmap.begin(); w != wordmap.end(); w++) cout << (*w).first << ": " << (*w).second.val() << endl; } ///:~
void incrWordData(int lhsInt, ECString wupper) { char temp[128]; ECString w(toLower(wupper.c_str(), temp)); numTerm[lhsInt]++; WordMap::iterator wmi = wordMap.find(w); if(wmi == wordMap.end()) { wordMap[w][lhsInt] = 1; return; } PosD& posd = (*wmi).second; PosD::iterator pdi = posd.find(lhsInt); if(pdi == posd.end()) { posd[lhsInt] = 1; } else (*pdi).second++; }
// Recursive search step: tries to place a word from rowMap as row number
// `haveTall` of the rectangle being built.
//
//   haveTall - number of rows already placed (also stored in mNowTall)
//   rowMap   - ordered map whose keys are the candidate row words
//   wordCols - per-column character buffers; row `haveTall` of every column
//              is overwritten while candidates are tried
//
// Returns mWantArea when the rectangle is complete, -mWantArea when the
// search is abandoned because the trumping area already reaches mWantArea,
// and 0 when no candidate row leads to a complete rectangle.
int WordRectFinder<MapT>::findWordRectRowsMapUpper(int haveTall, const WordMap& rowMap, char wordCols[][sBufSize])
{
    mNowTall = haveTall;                        // rectangle height == stack height
    const int wantWide = mRowTrie.getWordLength();
    const int wantTall = mColTrie.getWordLength();

    if (haveTall == wantTall)
        return mWantArea;                       // every column is now a full word

    if (haveTall > 2 && mWantArea <= WordRectSearchMgr<MapT>::getTrumpingArea())
        return -mWantArea;                      // a rectangle at least this big is already known

    char skipKey[sBufSize];
    const WordMap::const_iterator rowsEnd = rowMap.end();
    WordMap::const_iterator row = rowMap.begin();
    while (row != rowsEnd) {
        const char *candidate = row->first;

        // Copy the candidate into the columns one letter at a time and stop
        // at the first column that subTrix rejects.
        bool rejected = false;
        for (int col = 0; col < wantWide; col++) {
            skipKey[col] = candidate[col];
            wordCols[col][haveTall] = skipKey[col];
            if ( ! mColTrie.subTrix(wordCols[col], haveTall) ) {
                // Every word sharing this prefix fails in the same column, so
                // jump past all of them: '{' is the ASCII character right
                // after 'z', making prefix+"{" sort after any such word.
                skipKey[col+1] = '{';
                skipKey[col+2] = '\0';
                row = rowMap.upper_bound(skipKey);
                rejected = true;
                break;
            }
        }
        if (rejected)
            continue;                           // iterator was already repositioned

        mRowWordsNow[haveTall] = candidate;
        const int area = findWordRectRowsMapUpper(haveTall+1, rowMap, wordCols);
        if (area > 0)
            return area;
        ++row;
    }
    return 0;
}
int main(int argc, char *argv[]) { ECArgs args( argc, argv ); assert(args.nargs() == 1); ECString path(args.arg(0)); cerr << "At start of pHsgt" << endl; for(int n = 0 ; n < MAXNUMNTS ; n++) numTerm[n] = 0; Term::init( path ); readHeadInfo(path); int sentenceCount = 0; ECString s1lex("^^"); ECString s1nm("S1"); int s1Int = Term::get(s1nm)->toInt(); UnitRules ur; ur.init(); while(cin) { //if(sentenceCount > 4000) break; if(sentenceCount%10000 == 0) cerr << sentenceCount << endl; InputTree parse; cin >> parse; //cerr << parse << endl; if(!cin) break; if(parse.length() == 0) break; EcSPairs wtList; parse.make(wtList); InputTree* par; par = &parse; addWwData(par); incrWordData(s1Int, s1lex); ur.gatherData(par); sentenceCount++; } ECString resultsString(path); resultsString += "pSgT.txt"; ofstream resultsStream(resultsString.c_str()); assert(resultsStream); int numWords = 0; resultsStream << " \n"; //leave space for number of words; resultsStream.precision(3); ECString lastWord; int wordFreq = 0; WordMap::iterator wmi = wordMap.begin(); resultsStream << wordMap.size() << "\n\n"; for( ; wmi != wordMap.end() ; wmi++) { ECString w = (*wmi).first; resultsStream << w << "\t"; PosD& posd = (*wmi).second; PosD::iterator pdi = posd.begin(); int count = 0; for( ; pdi != posd.end(); pdi++) { int posInt = (*pdi).first; int c = (*pdi).second; count += c; float p = (float)c/(float)numTerm[posInt]; resultsStream << posInt << " " << p << " "; } resultsStream << "| " << count << "\n"; } ur.setData(path); return 1; }