Пример #1
0
// Build a dictionary of weight vectors from the features whose names begin
// with one of the given prefixes.
//
// Every entry of names_ is tested against prefs in order, and the first prefix
// it begins with is used. The rest of the name (prefix stripped) becomes the
// dictionary key, mapped to a vector of prefs.size()*numW_ values. The numW_
// values selected for the feature are set to getWeight(i-1,j) * labels_[0].
// featuresAdded_ is incremented once for every feature that matched a prefix.
//
//   prefs     - prefixes to look for at the start of each feature name
//   util      - handed to the constructor of the returned Dictionary
//   adjustPos - if true, the slot is computed from the prefix index and the
//               length of the stripped name (used for n-gram dictionaries);
//               if false, the prefix index alone selects the slot
//
// Returns a Dictionary allocated with new and indexed over the collected
// entries, or NULL when no feature name began with any of the prefixes.
Dictionary<vector<FeatVal> > * KyteaModel::makeDictionaryFromPrefixes(const vector<KyteaString> & prefs, StringUtil* util, bool adjustPos) {
    typedef Dictionary<vector<FeatVal> >::WordMap WordMap;
    const int numPrefs = (int)prefs.size();
    WordMap entries;
    for(int i = 0; i < (int)names_.size(); i++) {
        const KyteaString & featName = names_[i];
        // Find the first prefix that this feature name begins with
        int pos = 0;
        while(pos < numPrefs && !featName.beginsWith(prefs[pos]))
            pos++;
        // Names that match none of the prefixes are not part of this dictionary
        if(pos == numPrefs)
            continue;
        featuresAdded_++;
        // The dictionary key is the feature name without its prefix
        KyteaString name = featName.substr(prefs[pos].length());
        WordMap::iterator slot = entries.find(name);
        if(slot == entries.end())
            slot = entries.insert(WordMap::value_type(name, new vector<FeatVal>(prefs.size()*numW_))).first;
        // If this is an n-gram dictionary, adjust the position according to
        // n-gram length, otherwise just use the location of the prefix
        int id = (adjustPos ?
            (prefs.size()-pos-name.length())*numW_ :
            pos*numW_
        );
        vector<FeatVal> & weights = *slot->second;
        for(int j = 0; j < numW_; j++)
            weights[id+j] = getWeight(i-1,j) * labels_[0];
    }
    // Nothing matched, so there is no dictionary to build
    if(entries.size() == 0)
        return NULL;
    Dictionary<vector<FeatVal> > * ret = new Dictionary<vector<FeatVal> >(util);
    ret->buildIndex(entries);
    return ret;
}
Пример #2
0
// Look up values 
void FeatureLookup::addTagNgrams(const KyteaString & chars, 
                                 const Dictionary<FeatVec> * dict, 
                                 vector<FeatSum> & scores,
                                 int window, int startChar, int endChar) {
    if(!dict) return;
    // Create a substring that exactly covers the window that we are interested
    // in of up to -window characters before, and +window characters after
    int myStart = max(startChar-window,0);
    int myEnd = min(endChar+window,(int)chars.length());
    // cerr << "startChar=="<<startChar<<", endChar=="<<endChar<<", myStart=="<<myStart<<", myEnd=="<<myEnd<<endl;
    KyteaString str = 
        chars.substr(myStart, startChar-myStart) +
        chars.substr(endChar, myEnd-endChar);
    // Match the features in this substring
    Dictionary<FeatVec>::MatchResult res = dict->match(str);
    // Add up the sum of all the features
    // myStart-startChar is how far to the left of the starting character we are
    int offset = window-(startChar-myStart);
    for(int i = 0; i < (int)res.size(); i++) {
        // The position we are interested in is the matched position plus the
        // offset
        int pos = res[i].first + offset;
        // Reverse this and multiply by the number of candidates
        pos = (window*2 - pos - 1) * scores.size();
        FeatVal* vec = &((*res[i].second)[pos]);
        // Now add up all the values in the feature vector
        for(int j = 0; j < (int)scores.size(); j++) {
#ifdef KYTEA_SAFE
            if(j+pos >= (int)res[i].second->size() || j+pos < 0)
                THROW_ERROR("j+pos "<<j<<"+"<<pos<<" too big for res[i].second->size() "<<res[i].second->size()<<", window="<<window);
#endif
            scores[j] += vec[j];
        }
    }
}
Пример #3
0
// write out a language model
void TextModelIO::writeLM(const KyteaLM * lm) {
    // print a single endl for empty models
    if(lm == 0) {
        *str_ << endl;
        return;
    }
    *str_ << "lmn " << lm->n_ << endl;
    *str_ << "lmvocab " << lm->vocabSize_ << endl;
    
    KyteaChar spaceChar = util_->mapChar(" ");
    KyteaString nullString = util_->mapString(NULL_STRING);

    // sort the set of all keys
    set<KyteaString> keys;
    for(KyteaDoubleMap::const_iterator it = lm->probs_.begin(); it != lm->probs_.end(); it++)
        keys.insert(it->first);
    for(KyteaDoubleMap::const_iterator it = lm->fallbacks_.begin(); it != lm->fallbacks_.end(); it++)
        keys.insert(it->first);
    for(set<KyteaString>::const_iterator it = keys.begin(); it != keys.end(); it++) {
        KyteaDoubleMap::const_iterator fit = const_cast<KyteaLM*>(lm)->probs_.find(*it);
        KyteaString displayString;
        if(it->length() == 0)
            displayString = nullString;
        else {
            displayString = *it;
            // remove the null characters
            for(unsigned i = 0; i < displayString.length(); i++)
                if(!displayString[i])
                    displayString[i] = spaceChar;
        }
        *str_ << (fit == lm->probs_.end() ? NEG_INFINITY : fit->second) 
            << "\t" << util_->showString(displayString);
        fit = const_cast<KyteaLM*>(lm)->fallbacks_.find(*it);
        if(fit != lm->fallbacks_.end())
            *str_ << "\t" << fit->second;
        *str_ << endl;
    }
    *str_ << endl; 
}
Пример #4
0
// Find the dictionary entries that occur in chars.
//
// The state machine is advanced one character at a time, starting from state
// 0. When stepping from the current state yields 0 and the current state is
// not state 0, the state's failure link is followed and the step is retried.
// After each character, every entry listed in the output of the state that
// was reached is appended to the result, paired with the index of the
// character just consumed.
typename Dictionary<Entry>::MatchResult Dictionary<Entry>::match( const KyteaString & chars ) const {
    MatchResult matches;
    unsigned state = 0;
    const unsigned numChars = chars.length();
    for(unsigned idx = 0; idx < numChars; idx++) {
        KyteaChar c = chars[idx];
        // Try to advance; fall back along failure links while that is not
        // possible and there is still somewhere to fall back to
        unsigned next = states_[state]->step(c);
        while(next == 0 && state != 0) {
            state = states_[state]->failure;
            next = states_[state]->step(c);
        }
        state = next;
        // Report everything that the reached state recognizes
        std::vector<unsigned> & found = states_[state]->output;
        for(unsigned k = 0; k < found.size(); k++)
            matches.push_back( std::pair<unsigned, Entry*>(idx, entries_[found[k]]) );
    }
    return matches;
}