// Collect the weights of every feature whose name begins with one of the
// given prefixes into a dictionary keyed by the remainder of the name (the
// part after the prefix). Each entry holds prefs.size()*numW_ values.
//  prefs:     the prefixes to look for; the first one that matches is used
//  util:      handed to the constructor of the returned dictionary
//  adjustPos: selects how the slot inside an entry is computed (see below)
// Returns a newly allocated dictionary, or NULL if no feature name matched.
Dictionary<vector<FeatVal> > * KyteaModel::makeDictionaryFromPrefixes(const vector<KyteaString> & prefs, StringUtil* util, bool adjustPos) {
    typedef Dictionary<vector<FeatVal> > FeatDict;
    typedef FeatDict::WordMap WordMap;
    WordMap entries;
    const int numPrefs = (int)prefs.size();
    for(int featIdx = 0; featIdx < (int)names_.size(); featIdx++) {
        const KyteaString & featName = names_[featIdx];
        // Find the first prefix that this feature name starts with
        int prefIdx = 0;
        while(prefIdx < numPrefs && !featName.beginsWith(prefs[prefIdx]))
            prefIdx++;
        // No prefix matched, so this feature does not belong in the dictionary
        if(prefIdx == numPrefs)
            continue;
        featuresAdded_++;
        // The dictionary key is whatever follows the prefix
        KyteaString suffix = featName.substr(prefs[prefIdx].length());
        WordMap::iterator slot = entries.find(suffix);
        if(slot == entries.end())
            slot = entries.insert(WordMap::value_type(suffix,new vector<FeatVal>(prefs.size()*numW_))).first;
        // For an n-gram dictionary the slot is adjusted by the length of the
        //  n-gram; otherwise it is simply the index of the matched prefix
        int base;
        if(adjustPos)
            base = (prefs.size()-prefIdx-suffix.length())*numW_;
        else
            base = prefIdx*numW_;
        // Copy the numW_ weights of this feature into the entry
        vector<FeatVal> & values = *slot->second;
        for(int w = 0; w < numW_; w++)
            values[base+w] = getWeight(featIdx-1,w) * labels_[0];
    }
    // Nothing matched: there is no dictionary to build
    if(entries.size() == 0)
        return NULL;
    FeatDict * ret = new FeatDict(util);
    ret->buildIndex(entries);
    return ret;
}
// Look up values void FeatureLookup::addTagNgrams(const KyteaString & chars, const Dictionary<FeatVec> * dict, vector<FeatSum> & scores, int window, int startChar, int endChar) { if(!dict) return; // Create a substring that exactly covers the window that we are interested // in of up to -window characters before, and +window characters after int myStart = max(startChar-window,0); int myEnd = min(endChar+window,(int)chars.length()); // cerr << "startChar=="<<startChar<<", endChar=="<<endChar<<", myStart=="<<myStart<<", myEnd=="<<myEnd<<endl; KyteaString str = chars.substr(myStart, startChar-myStart) + chars.substr(endChar, myEnd-endChar); // Match the features in this substring Dictionary<FeatVec>::MatchResult res = dict->match(str); // Add up the sum of all the features // myStart-startChar is how far to the left of the starting character we are int offset = window-(startChar-myStart); for(int i = 0; i < (int)res.size(); i++) { // The position we are interested in is the matched position plus the // offset int pos = res[i].first + offset; // Reverse this and multiply by the number of candidates pos = (window*2 - pos - 1) * scores.size(); FeatVal* vec = &((*res[i].second)[pos]); // Now add up all the values in the feature vector for(int j = 0; j < (int)scores.size(); j++) { #ifdef KYTEA_SAFE if(j+pos >= (int)res[i].second->size() || j+pos < 0) THROW_ERROR("j+pos "<<j<<"+"<<pos<<" too big for res[i].second->size() "<<res[i].second->size()<<", window="<<window); #endif scores[j] += vec[j]; } } }
// write out a language model void TextModelIO::writeLM(const KyteaLM * lm) { // print a single endl for empty models if(lm == 0) { *str_ << endl; return; } *str_ << "lmn " << lm->n_ << endl; *str_ << "lmvocab " << lm->vocabSize_ << endl; KyteaChar spaceChar = util_->mapChar(" "); KyteaString nullString = util_->mapString(NULL_STRING); // sort the set of all keys set<KyteaString> keys; for(KyteaDoubleMap::const_iterator it = lm->probs_.begin(); it != lm->probs_.end(); it++) keys.insert(it->first); for(KyteaDoubleMap::const_iterator it = lm->fallbacks_.begin(); it != lm->fallbacks_.end(); it++) keys.insert(it->first); for(set<KyteaString>::const_iterator it = keys.begin(); it != keys.end(); it++) { KyteaDoubleMap::const_iterator fit = const_cast<KyteaLM*>(lm)->probs_.find(*it); KyteaString displayString; if(it->length() == 0) displayString = nullString; else { displayString = *it; // remove the null characters for(unsigned i = 0; i < displayString.length(); i++) if(!displayString[i]) displayString[i] = spaceChar; } *str_ << (fit == lm->probs_.end() ? NEG_INFINITY : fit->second) << "\t" << util_->showString(displayString); fit = const_cast<KyteaLM*>(lm)->fallbacks_.find(*it); if(fit != lm->fallbacks_.end()) *str_ << "\t" << fit->second; *str_ << endl; } *str_ << endl; }
typename Dictionary<Entry>::MatchResult Dictionary<Entry>::match( const KyteaString & chars ) const {
    // Run the characters of chars through the state machine in states_ and
    //  collect every entry listed in the output of each state that is reached.
    //  Each result pairs the index of the character that was just consumed
    //  with a pointer to the entry.
    MatchResult found;
    unsigned state = 0;
    const unsigned numChars = chars.length();
    for(unsigned pos = 0; pos < numChars; pos++) {
        const KyteaChar ch = chars[pos];
        // While the current state has no transition on ch (step gives 0),
        //  fall back along the failure links; stop once state 0 is reached
        unsigned target = states_[state]->step(ch);
        while(target == 0 && state != 0) {
            state = states_[state]->failure;
            target = states_[state]->step(ch);
        }
        state = target;
        // Report everything attached to the state we ended up in
        std::vector<unsigned> & hits = states_[state]->output;
        for(unsigned h = 0; h < hits.size(); h++)
            found.push_back( std::pair<unsigned, Entry*>(pos, entries_[hits[h]]) );
    }
    return found;
}