Ejemplo n.º 1
0
            // Segments the code points in [begin, end) and appends/stores the
            // resulting words in res via _cut.
            //
            // Steps: build one SegmentChar per input position, fill its DAG from
            // _dictTrie, run _calcDP over the positions, then let _cut produce res.
            //
            // Asserts _getInitFlag(). Returns false when the range is empty or when
            // _cut reports failure (which is logged); returns true otherwise.
            bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
            {
                if(end == begin)
                {
                    return false;
                }
                assert(_getInitFlag());
                vector<SegmentChar> segmentChars(end - begin);

                // Build the DAG for every position of the input.
                for(size_t i = 0; i < segmentChars.size(); ++i)
                {
                    SegmentChar& current = segmentChars[i];
                    current.uniCh = *(begin + i);
                    current.dag.clear();
                    _dictTrie.find(begin + i, end, current.dag, i);
                    // Add the entry keyed by the position itself with a NULL unit.
                    // The pair is constructed directly rather than through
                    // make_pair<K, V>(i, NULL): with explicit template arguments the
                    // C++11 make_pair takes rvalue references, so the lvalue i would
                    // not bind and the call would not compile.
                    current.dag.insert(std::pair<DagType::key_type, DagType::mapped_type>(
                                i, static_cast<DagType::mapped_type>(NULL)));
                }

                _calcDP(segmentChars);

                if(!_cut(segmentChars, res))
                {
                    LogError("_cut failed.");
                    return false;
                }

                return true;
            }
Ejemplo n.º 2
0
            bool tag(const string& src, vector<pair<string, string> >& res)
            {
                assert(_getInitFlag());
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                    LogError("_mixSegment cut failed");
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                        LogError("decode failed.");
                        return false;
                    }
                    tmp = _dictTrie.find(unico.begin(), unico.end());
                    res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
                }
                tmp = NULL;
                return !res.empty();
            }
Ejemplo n.º 3
0
            bool tag(const string& src, vector<pair<string, string> >& res) const
            {
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                  Rcout<<"_mixSegment cut failed"<<std::endl;
      
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                      Rcout<<"decode failed."<<std::endl;
        
                        return false;
                    }
                    tmp = _dictTrie->find(unico.begin(), unico.end());
                    if(tmp == NULL || tmp->tag.empty())
                    {
                      res.push_back(make_pair(*itr, _specialRule(unico)));
                    }
                    else
                    {
                      res.push_back(make_pair(*itr, tmp->tag));
                    }                }
                return !res.empty();
            }
Ejemplo n.º 4
0
 // Initializes the dictionary trie from dictPath and the segmenter from
 // dictPath and hmmFilePath, then records the segmenter's result through
 // _setInitFlag and returns what _setInitFlag returns.
 //
 // Asserts that the object has not been initialized yet and that _dictTrie
 // tests true after its init call.
 //
 // charStatus, startProb, emitProb, endProb and transProb are accepted but
 // not read by this body.
 bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
 {
     assert(!_getInitFlag());

     _dictTrie.init(dictPath);
     assert(_dictTrie);

     const bool segmentReady = _segment.init(dictPath, hmmFilePath);
     return _setInitFlag(segmentReady);
 }
Ejemplo n.º 5
0
 // Initializes the dictionary trie from dictPath and, optionally,
 // userDictPath.
 //
 // When the init flag is already set, logs an error and returns false
 // without touching the trie. Otherwise initializes _dictTrie, asserts that
 // it tests true, logs success and returns the result of _setInitFlag(true).
 bool init(const string& dictPath, const string& userDictPath = "")
 {
     if(!_getInitFlag())
     {
         _dictTrie.init(dictPath, userDictPath);
         assert(_dictTrie);
         LogInfo("MPSegment init(%s) ok", dictPath.c_str());
         return _setInitFlag(true);
     }

     LogError("already inited before now.");
     return false;
 }
Ejemplo n.º 6
0
 // Appends one SegmentChar to SegmentChars for every code point in
 // [begin, end).
 //
 // Each appended element carries the code point and a DAG filled by
 // _dictTrie.find for the sub-range starting at that code point. When the
 // DAG has no entry keyed by the element's own offset, one is added with a
 // NULL unit. Always returns true.
 bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, vector<SegmentChar>& SegmentChars) const
 {
     // Single scratch element, reused and copied into the output each step.
     SegmentChar node;
     Unicode::const_iterator cursor = begin;
     while(cursor != end)
     {
         const size_t position = cursor - begin;
         node.uniCh = *cursor;
         node.dag.clear();
         _dictTrie.find(cursor, end, node.dag, position);
         if(!isIn(node.dag, position))
         {
             node.dag[position] = NULL;
         }
         SegmentChars.push_back(node);
         ++cursor;
     }
     return true;
 }
Ejemplo n.º 7
0
            // Fills pInfo and weight of every element of SegmentChars by walking
            // the positions from last to first.
            //
            // For each DAG entry (nextPos, unit) of position i, the candidate score
            // is the weight already computed for position nextPos + 1 (when that
            // position exists) plus the unit's weight, or _dictTrie.getMinWeight()
            // when the unit is NULL. The highest-scoring candidate is stored in
            // SegmentChars[i].
            //
            // Returns false (after logging) when SegmentChars is empty, true
            // otherwise.
            bool _calcDP(vector<SegmentChar>& SegmentChars)const
            {
                if(SegmentChars.empty())
                {
                    LogError("SegmentChars empty");
                    return false;
                }

                const size_t total = SegmentChars.size();

                // Count down with an unsigned index: "i-- > 0" tests before the
                // decrement, so the body sees total-1 ... 0 and the size is never
                // narrowed to int.
                for(size_t i = total; i-- > 0; )
                {
                    SegmentChar& current = SegmentChars[i];
                    current.pInfo = NULL;
                    current.weight = MIN_DOUBLE;
                    for(DagType::const_iterator it = current.dag.begin(); it != current.dag.end(); ++it)
                    {
                        const size_t nextPos = it->first;
                        const DictUnit* p = it->second;
                        double val = 0.0;
                        if(nextPos + 1 < total)
                        {
                            val += SegmentChars[nextPos + 1].weight;
                        }

                        if(p)
                        {
                            val += p->weight;
                        }
                        else
                        {
                            val += _dictTrie.getMinWeight();
                        }

                        if(val > current.weight)
                        {
                            current.pInfo = p;
                            current.weight = val;
                        }
                    }
                }
                return true;
            }
Ejemplo n.º 8
0
// Checks DictTrie against DICT_FILE: the minimum weight bound, the lookup of a
// single word, and the prefix lookup of a longer word.
TEST(DictTrieTest, Test1) {

    string s1, s2;
    DictTrie trie;
    trie.init(DICT_FILE);
    // NOTE(review): this is a one-sided bound (any sufficiently negative
    // minimum weight passes) -- confirm whether an absolute difference was
    // intended.
    ASSERT_LT(trie.getMinWeight() + 15.6479, 0.001);
    string word("来到");
    Unicode uni;
    ASSERT_TRUE(TransCode::decode(word, uni));
    DictUnit nodeInfo;
    nodeInfo.word = uni;
    nodeInfo.tag = "v";
    nodeInfo.weight = -8.87033;
    s1 << nodeInfo;
    // Fail the test cleanly instead of dereferencing a null lookup result.
    const DictUnit* found = trie.find(uni.begin(), uni.end());
    ASSERT_TRUE(found != NULL);
    s2 << (*found);

    EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);

    // Expected prefix matches of the long word, built one lookup at a time and
    // keyed by the index of each prefix's last code point.
    word = "清华大学";
    LocalVector<pair<size_t, const DictUnit*> > res;
    LocalVector<pair<size_t, const DictUnit*> > res2;
    const char * words[] = {"清", "清华", "清华大学"};
    for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
        ASSERT_TRUE(TransCode::decode(words[i], uni));
        res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
    }

    ASSERT_TRUE(TransCode::decode(word, uni));
    ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
    // Compare the expected matches with what the prefix lookup produced.
    // Serializing res on both sides would leave res2 unchecked.
    s1 << res;
    s2 << res2;
    ASSERT_EQ(s1, s2);
}
Ejemplo n.º 9
0
 // Forwards value to _dictTrie.isUserDictSingleChineseWord and returns its
 // answer.
 bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
 {
     const bool result = _dictTrie.isUserDictSingleChineseWord(value);
     return result;
 }
Ejemplo n.º 10
0
 // Initializes the dictionary trie from dictPath and, optionally,
 // userDictPath, logs a success message and returns true.
 //
 // The result of _dictTrie.init is handed to LIMONP_CHECK, a macro defined
 // outside this block; presumably it treats a false result as fatal rather
 // than returning -- confirm against the macro's definition.
 bool init(const string& dictPath, const string& userDictPath = "")
 {
     LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
     LogInfo("MPSegment init(%s) ok", dictPath.c_str());
     return true;
 }