// Builds a dictionary from every feature name in names_ that starts with one
// of the given prefixes.
//
// For each matching feature, the part of the name after the prefix becomes
// the dictionary key, and the entry is a vector of prefs.size()*numW_ values.
// numW_ consecutive slots of that vector are filled with
// getWeight(i-1, j) * labels_[0], where i is the feature's index in names_.
// featuresAdded_ is incremented once per matching feature.
//
//   prefs     - prefixes to look for; the first one that matches wins.
//   util      - passed unchanged to the Dictionary constructor.
//   adjustPos - when true, the slot offset is derived from the key length
//               (n-gram dictionaries); when false, from the prefix index.
//
// Returns a Dictionary allocated with new, or NULL when no feature matched.
Dictionary<vector<FeatVal> > * KyteaModel::makeDictionaryFromPrefixes(const vector<KyteaString> & prefs, StringUtil* util, bool adjustPos) {
    typedef Dictionary<vector<FeatVal> >::WordMap WordMap;
    const int numPrefs = (int)prefs.size();
    WordMap entries;

    for(int featIdx = 0; featIdx < (int)names_.size(); featIdx++) {
        const KyteaString & featName = names_[featIdx];

        // Find the first prefix this feature name starts with.
        int pos = 0;
        while(pos < numPrefs && !featName.beginsWith(prefs[pos]))
            pos++;
        if(pos == numPrefs)
            continue; // no prefix matched, so this feature is not ours

        featuresAdded_++;

        // The key is whatever follows the prefix; create its entry on first sight.
        KyteaString name = featName.substr(prefs[pos].length());
        WordMap::iterator entry = entries.find(name);
        if(entry == entries.end())
            entry = entries.insert(WordMap::value_type(name, new vector<FeatVal>(prefs.size()*numW_))).first;

        // If this is an n-gram dictionary, adjust the position according to
        // n-gram length, otherwise just use the location of the prefix
        // NOTE(review): with adjustPos the offset is only in range when
        // pos + name.length() <= prefs.size(); this is not checked here.
        int id = (adjustPos ? (prefs.size()-pos-name.length())*numW_ : pos*numW_ );
        for(int j = 0; j < numW_; j++)
            (*entry->second)[id+j] = getWeight(featIdx-1,j) * labels_[0];
    }

    // Nothing matched: there is no dictionary to build.
    if(entries.size() == 0)
        return NULL;

    Dictionary<vector<FeatVal> > * ret = new Dictionary<vector<FeatVal> >(util);
    ret->buildIndex(entries);
    return ret;
}
/**
 * Reads a text file and inserts the leading token of each line into
 * `wordmap` (declared outside this function).
 *
 * Characters are accumulated into the current word until the first space on
 * the line. Every space inserts the word accumulated so far into `wordmap`
 * and stops accumulation for the rest of the line. A newline clears the word
 * and re-enables accumulation. Consequently a line that contains no space
 * never triggers an insert.
 *
 * If the file cannot be opened the function returns without doing anything.
 *
 * @param fileName path of the file to read.
 */
void loadFile (string fileName) {
    ifstream fin(fileName.c_str());
    if (!fin) {
        return;
    }

    string newWord;
    bool add = true;   // true while still inside the first token of the line
    char nextChar;

    // get(char&) leaves the stream in a failed state at end of file, so the
    // loop stops exactly when there is no more data. The previous test,
    // `nextChar != fin.eof()`, compared the character with a bool: it dropped
    // NUL bytes and let the EOF sentinel through as if it were data.
    while (fin.get(nextChar)) {
        if (nextChar == ' ') {
            add = false;
            wordmap.insert(newWord);
        } else if (nextChar == '\n') {
            newWord.clear();
            add = true;
        } else if (add) {
            newWord += nextChar;
        }
    }
}
bool Nce::parseFile(const QString &nce, int _class) { QString path = qApp->applicationDirPath(); #ifdef Q_OS_MAC path += "/../../.."; #endif path += "/nce/"; path += nce + "/"; QString classStr = QString::number(_class); if (_class < 10) classStr = "0" + QString::number(_class); path += classStr; QFileInfo fileInfo(path + ".txt"); if (!fileInfo.exists()) path += ".TXT"; else path += ".txt"; file_.setFileName(path); if (!file_.open(QIODevice::ReadOnly)) { qDebug() << "Read File Error!"+ path; return false; } QString text = file_.readAll(); if (file_.isOpen()) file_.close(); text = simpleChange(text); QStringList sentenceList = text.split(QRegExp("[\\.!\\?]"),QString::SkipEmptyParts); int sentenceSize = sentenceList.size(); ClassIndex index; index.nce = nce; index.class_ = _class; data_.classContent_.insert(index,text); WordMap wordMap; foreach (QString sentence, sentenceList) { QStringList wordList; wordList = sentence.split(QRegExp("\\W+"), QString::SkipEmptyParts); NceWordInfo wordInfo; wordInfo.index = index; // wordInfo.sentences.push_back(sentence); foreach (QString w, wordList) { Word word; word.index = index; word.word = w; wordMap.insert(word,wordInfo); for (WordMap::iterator it = wordMap.begin(), ie = wordMap.end(); it != ie; ++it){ if (it.key() == word) { NceWordInfo& info = it.value(); info.sentences.push_back(sentence); info.sentences.removeDuplicates(); } } }
void readWords(const LetterMap &letterMap, WordMap &words) { ifstream inf("words.txt"); string word, sortedWord; short value; while( getline(inf, word)) { if(word.length() <= MAX_WORD_LENGTH && word.find_first_not_of("abcedefghijklmnopqrstuvwxyz") == string::npos) { value = 0; for(string::const_iterator itr = word.begin(); itr != word.end(); itr++) value += (letterMap.find(*itr))->second; words.insert(WordMap::value_type(word, value)); } // if word up to MAX_WORD_LENGTH characters and no captial letters. } // while } // readWords()