Exemple #1
0
/**
 * Command-line entry point: segments the file named by the first positional
 * argument with the algorithm selected via --algorithm.
 *
 * "cutHMM" uses HMMSegment (needs --modelpath), "cutMix" uses MixSegment
 * (needs --dictpath and --modelpath); any other value, including none,
 * falls back to MPSegment (needs --dictpath).
 *
 * @return EXIT_SUCCESS when the selected segmenter was initialized and run,
 *         EXIT_FAILURE when no arguments were given or initialization failed.
 */
int main(int argc, char ** argv)
{
    if(argc < 2)
    {
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
            <<"\t--dictpath\tsee example\n"
            <<"\t--modelpath\tsee example\n"
            <<"example:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
            <<endl;
        
        return EXIT_FAILURE;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    string modelPath = arg["--modelpath"];
    string algorithm = arg["--algorithm"];

    if("cutHMM" == algorithm)
    {
        // HMM-only segmentation: only the model file is required.
        HMMSegment seg;
        if(!seg.init(modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else if("cutMix" == algorithm)
    {
        // Mixed segmentation: needs both the dictionary and the HMM model.
        MixSegment seg;
        if(!seg.init(dictPath.c_str(), modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else
    {
        // Default (cutDAG): dictionary-based segmentation.
        MPSegment seg;
        if(!seg.init(dictPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false`, i.e. 0 == EXIT_SUCCESS: the failure was
            // reported to the shell as success.
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Exemple #2
0
 // Initializes the two underlying segmenters: _mpSeg from mpSegDict (plus the
 // optional userDict) and _hmmSeg from hmmSegDict.
 //
 // Both init results are passed through LIMONP_CHECK; presumably the macro
 // aborts or reports when the expression is false -- TODO confirm against its
 // definition. If control reaches the end, the function returns true.
 bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 {
     LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
     LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
     // LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
     return true;
 }
Exemple #3
0
            // Cuts [begin, end) with _mpSeg first, then re-cuts every run of
            // consecutive single-character words (that are not single-character
            // user-dictionary words) with _hmmSeg. Results are appended to res.
            //
            // Returns false, after printing a message to Rcout, as soon as either
            // segmenter fails; res may already hold words appended before the failure.
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
            {
                // First pass: segmentation by _mpSeg.
                vector<Unicode> mpWords;
                mpWords.reserve(end - begin);
                if(!_mpSeg.cut(begin, end, mpWords))
                {
                    Rcout<<"mpSeg cutDAG failed."<<std::endl;
                    return false;
                }

                // Scratch buffers reused for every run handed to the HMM segmenter.
                vector<Unicode> hmmWords;
                hmmWords.reserve(end - begin);
                Unicode run;
                run.reserve(end - begin);

                size_t pos = 0;
                while (pos < mpWords.size())
                {
                    const Unicode& word = mpWords[pos];
                    const bool isLooseChar = (1 == word.size()) && !_mpSeg.isUserDictSingleChineseWord(word[0]);
                    if (!isLooseChar)
                    {
                        // Multi-character word or a user-dictionary single: keep as is.
                        res.push_back(word);
                        ++pos;
                        continue;
                    }

                    // Gather the maximal run of loose single characters starting at pos.
                    size_t runEnd = pos;
                    for (; runEnd < mpWords.size(); ++runEnd)
                    {
                        const Unicode& next = mpWords[runEnd];
                        if (1 != next.size() || _mpSeg.isUserDictSingleChineseWord(next[0]))
                        {
                            break;
                        }
                        run.push_back(next[0]);
                    }

                    // Second pass on the collected run.
                    if (!_hmmSeg.cut(run.begin(), run.end(), hmmWords))
                    {
                        Rcout<<"_hmmSeg cut failed."<<std::endl;
                        return false;
                    }

                    // Move the HMM words into the output, then reset the scratch buffers.
                    res.insert(res.end(), hmmWords.begin(), hmmWords.end());
                    run.clear();
                    hmmWords.clear();

                    // Resume right after the run that was just processed.
                    pos = runEnd;
                }
                return true;
            }
Exemple #4
0
        // Disposes both underlying segmenters (_fullSeg and _hmmSeg) and clears
        // the init flag. Always returns true.
        bool dispose()
        {
#ifndef NO_CODING_LOG
            // Nothing to release if the object was never initialized (or was
            // already disposed). This guard is compiled out when NO_CODING_LOG
            // is defined, in which case the members are disposed unconditionally.
            if(!_getInitFlag())
            {
                return true;
            }
#endif
            _fullSeg.dispose();
            _hmmSeg.dispose();
            _setInitFlag(false);
            return true;
        }
Exemple #5
0
/**
 * Demo driver: runs HMMSegment, MixSegment and MPSegment in turn over
 * "testlines.utf8", using the dictionary/model files under "../dicts".
 *
 * @return EXIT_SUCCESS when all three segmenters were initialized and run,
 *         EXIT_FAILURE as soon as one of them fails to initialize.
 */
int main(int argc, char ** argv)
{
    //demo
    {
        HMMSegment seg;
        if(!seg.init("../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    {
        MixSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        cout<<"Cut type"<<endl;
        cut_type(&seg, "testlines.utf8");
        cout<<endl;
        seg.dispose();
    }
    {
        MPSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8"))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false`, i.e. 0 == EXIT_SUCCESS: the failure was
            // reported to the shell as success.
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Exemple #6
0
 // Initializes _mpSeg from mpSegDict (plus the optional userDict) and then
 // _hmmSeg from hmmSegDict. _hmmSeg is only attempted once _mpSeg succeeded.
 //
 // Returns the result of _setInitFlag(true) when both succeed; otherwise logs
 // which segmenter failed and returns false.
 bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 {
     // Initializing an already initialized object is treated as a programming error.
     assert(!_getInitFlag());

     if(_mpSeg.init(mpSegDict, userDict))
     {
         if(_hmmSeg.init(hmmSegDict))
         {
             LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
             return _setInitFlag(true);
         }
         LogError("_hmmSeg init");
         return false;
     }

     LogError("_mpSeg init");
     return false;
 }
Exemple #7
0
        /**
         * Cuts [begin, end) with _hmmSeg first; every HMM word longer than
         * _maxWordLen is then re-cut with _fullSeg. Resulting words are
         * appended to res.
         *
         * Unless NO_CODING_LOG is defined, the call is rejected when the object
         * is not initialized or when begin > end.
         *
         * @param begin start of the range to segment
         * @param end   end of the range to segment
         * @param res   output vector; words are appended, existing content is kept
         * @return true on success; false (after logging) when a precondition
         *         check fails or either segmenter fails. On failure res may
         *         already hold words appended before the error.
         */
        bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
        {
#ifndef NO_CODING_LOG
            if (!_getInitFlag())
            {
                LogError("not inited.");
                return false;
            }
            if (begin > end)
            {
                LogError("begin > end");
                return false;
            }
#endif
            //use hmm cut first
            vector<Unicode> hmmRes;
            if (!_hmmSeg.cut(begin, end, hmmRes))
            {
                LogError("_hmmSeg cut failed.");
                return false;
            }

            // Scratch buffer for _fullSeg output, reused across long words.
            vector<Unicode> fullRes;
            for (vector<Unicode>::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); ++hmmResItr)
            {
                // Short enough: just use the hmm result.
                if (!(hmmResItr->size() > _maxWordLen))
                {
                    res.push_back(*hmmResItr);
                    continue;
                }

                // Too long: cut with _fullSeg and put its words in res.
                // Reset the buffer first -- this function appends to its own
                // output and _fullSeg.cut presumably does the same, so without
                // the clear() the words of earlier long pieces would be emitted
                // again for every later one.
                fullRes.clear();
                if (!_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes))
                {
                    // Previously this piece was dropped silently while the
                    // function still reported success.
                    LogError("_fullSeg cut failed.");
                    return false;
                }
                res.insert(res.end(), fullRes.begin(), fullRes.end());
            }

            return true;
        }
Exemple #8
0
        /**
         * Initializes the underlying _hmmSeg and _fullSeg segmenters, in that
         * order, and sets the init flag.
         *
         * @return the result of _setInitFlag(true) when both segmenters were
         *         initialized; false (after logging) when one of them fails or,
         *         unless NO_CODING_LOG is defined, when the object is already
         *         initialized.
         */
        bool init()
        {
#ifndef NO_CODING_LOG
            if (_getInitFlag())
            {
                LogError("inited.");
                // The error was logged but execution used to fall through and
                // initialize the members a second time; reject the call instead,
                // like the other guarded error paths do.
                return false;
            }
#endif
            if (!_hmmSeg.init())
            {
                LogError("_hmmSeg init");
                return false;
            }
            if (!_fullSeg.init())
            {
                LogError("_fullSeg init");
                return false;
            }
            return _setInitFlag(true);
        }