Example #1
0
/**
 * Interactive test driver: reads lines from standard input, segments each
 * non-empty line with seg->cut2(), and prints the resulting words.
 *
 * @param seg  segmenter wrapper used to cut each input line; it is
 *             dereferenced for every non-empty line and is not checked
 *             for null here.
 * @return always 0, once standard input is exhausted.
 */
int MP_test( SegmentWrapper *seg )
{
    string                          s_sentence;
    MPSegment                       mpSeg;
    vector<STWordInfo>              words2;

    // NOTE(review): mpSeg is initialised but never used for cutting below;
    // kept because init() may have side effects that are not visible here.
    mpSeg.init( pDict );

    // Read straight into the std::string. The previous fixed 1024-element
    // buffer made cin.getline() set failbit on longer lines, which ended the
    // loop silently and dropped the remaining input.
    while (std::getline(std::cin, s_sentence))
    {
        if ( s_sentence.empty() )
            continue;

        // cut the sentence into several words
        seg->cut2( s_sentence, words2 );

        std::cout << "\tword num:" << words2.size() << endl;
        for ( size_t i=0; i<words2.size(); ++i)
            words2[i].print();
        std::cout << "-----------"<< std::endl;

        // start the next line with an empty result vector
        words2.clear();
    }

    return 0;
}
Example #2
0
/**
 * Command-line entry point for the segmentation demo.
 *
 * Prints a usage message and fails when fewer than two arguments are given.
 * Otherwise the value of --algorithm selects the segmenter:
 *   - "cutHMM": HMMSegment initialised from --modelpath
 *   - "cutMix": MixSegment initialised from --dictpath and --modelpath
 *   - anything else: MPSegment initialised from --dictpath
 * The chosen segmenter is then handed to cut() together with arg[1]
 * (presumably the input filename, per the usage text).
 *
 * @return EXIT_SUCCESS on success; EXIT_FAILURE on a usage error or when
 *         the selected segmenter fails to initialise.
 */
int main(int argc, char ** argv)
{
    if(argc < 2)
    {
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
            <<"\t--dictpath\tsee example\n"
            <<"\t--modelpath\tsee example\n"
            <<"example:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
            <<endl;

        return EXIT_FAILURE;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    string modelPath = arg["--modelpath"];
    string algorithm = arg["--algorithm"];

    if("cutHMM" == algorithm)
    {
        HMMSegment seg;
        if(!seg.init(modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else if("cutMix" == algorithm)
    {
        MixSegment seg;
        if(!seg.init(dictPath.c_str(), modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else
    {
        // Default branch: any other (or missing) --algorithm value.
        MPSegment seg;
        if(!seg.init(dictPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false;`, i.e. exit status 0, which reported
            // success to the caller even though initialisation failed.
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Example #3
0
            /**
             * Two-pass segmentation of [begin, end).
             *
             * The range is first cut with _mpSeg. Every resulting word is
             * appended to res as-is, except for consecutive runs of
             * single-character words that _mpSeg does not recognise as
             * user-dictionary single-character words: each such run is joined
             * back together and re-cut with _hmmSeg, and the HMM output is
             * appended instead.
             *
             * @param begin  start of the input sequence
             * @param end    one past the end of the input sequence
             * @param res    output; words are appended (not cleared first)
             * @return false if either segmenter reports failure (res may then
             *         hold a partial result), true otherwise.
             */
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
            {
                vector<Unicode> mpWords;
                mpWords.reserve(end - begin);
                if(!_mpSeg.cut(begin, end, mpWords))
                {
                    Rcout<<"mpSeg cutDAG failed."<<std::endl;
                    return false;
                }

                // Scratch buffers reused for every run handed to the HMM segmenter.
                vector<Unicode> hmmWords;
                hmmWords.reserve(end - begin);
                Unicode run;
                run.reserve(end - begin);

                size_t pos = 0;
                while (pos < mpWords.size())
                {
                    const Unicode& word = mpWords[pos];

                    // A word needs the HMM pass only when it is a single
                    // character that is not a user-dictionary entry.
                    const bool loneUnknownChar =
                        (word.size() == 1) && !_mpSeg.isUserDictSingleChineseWord(word[0]);
                    if (!loneUnknownChar)
                    {
                        res.push_back(word);
                        ++pos;
                        continue;
                    }

                    // Gather the maximal run of such single characters starting at pos.
                    size_t next = pos;
                    for (; next < mpWords.size(); ++next)
                    {
                        const Unicode& candidate = mpWords[next];
                        if (candidate.size() != 1 || _mpSeg.isUserDictSingleChineseWord(candidate[0]))
                        {
                            break;
                        }
                        run.push_back(candidate[0]);
                    }

                    // Re-cut the joined run with the HMM segmenter.
                    if (!_hmmSeg.cut(run.begin(), run.end(), hmmWords))
                    {
                        Rcout<<"_hmmSeg cut failed."<<std::endl;
                        return false;
                    }

                    // Append the HMM words, then reset the scratch buffers.
                    res.insert(res.end(), hmmWords.begin(), hmmWords.end());
                    run.clear();
                    hmmWords.clear();

                    // Resume right after the run that was just processed.
                    pos = next;
                }
                return true;
            }
Example #4
0
 // Initialises both underlying segmenters: _mpSeg from mpSegDict (plus the
 // optional userDict, empty by default) and then _hmmSeg from hmmSegDict.
 // Each init() result is passed to LIMONP_CHECK.
 // NOTE(review): LIMONP_CHECK's expansion is not visible here; presumably it
 // aborts or otherwise does not continue when the expression is false, which
 // would explain why this function only ever returns true — confirm.
 bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 {
     LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
     LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
     // Logging of the dictionary paths is currently disabled:
     // LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
     return true;
 }
Example #5
0
            bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
            {
                vector<string> words;
                if(!_segment.cut(str, words))
                {
                    LogError("segment cut(%s) failed.", str.c_str());
                    return false;
                }

                unordered_map<string, double> wordmap;
                for(uint i = 0; i < words.size(); i ++)
                {
                    wordmap[ words[i] ] += 1.0;
                }

                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
                {
                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                    if(cit != _idfMap.end())
                    {
                        itr->second *= cit->second;
                        itr ++;
                    }
                    else
                    {
                        itr = wordmap.erase(itr);
                    }
                }

                keywords.resize(MIN(topN, wordmap.size()));
                partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
                return true;
            }
Example #6
0
 bool init(const string& dictPath, const string& idfPath)
 {
     ifstream ifs(idfPath.c_str());
     if(!ifs)
     {
         LogError("open %s failed.", idfPath.c_str());
         return false;
     }
     string line ;
     vector<string> buf;
     for(uint lineno = 0; getline(ifs, line); lineno++)
     {
         buf.clear();
         if(line.empty())
         {
             LogError("line[%d] empty. skipped.", lineno);
             continue;
         }
         if(!split(line, buf, " ") || buf.size() != 2)
         {
             LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
             continue;
         }
         _idfMap[buf[0]] = atof(buf[1].c_str());
     }
     return _setInitFlag(_segment.init(dictPath));
 };
Example #7
0
/**
 * Demo entry point: runs the HMM, Mix and MP segmenters in turn over
 * "testlines.utf8", using the model and dictionary files under "../dicts/".
 * The Mix segmenter is additionally passed to cut_type().
 *
 * Command-line arguments are not used.
 *
 * @return EXIT_SUCCESS when all three segmenters initialise;
 *         EXIT_FAILURE as soon as one of them fails to initialise.
 */
int main(int argc, char ** argv)
{
    //demo
    {
        HMMSegment seg;
        if(!seg.init("../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    {
        MixSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        cout<<"Cut type"<<endl;
        cut_type(&seg, "testlines.utf8");
        cout<<endl;
        seg.dispose();
    }
    {
        MPSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8"))
        {
            cout<<"seg init failed."<<endl;
            // Was `return false;`, i.e. exit status 0, which reported
            // success to the caller even though initialisation failed.
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Example #8
0
 /**
  * Initialises the mixed segmenter: first _mpSeg (from mpSegDict and the
  * optional userDict), then _hmmSeg (from hmmSegDict).
  *
  * Asserts that the init flag is not already set.
  *
  * @param mpSegDict   dictionary path for the MP segmenter
  * @param hmmSegDict  model path for the HMM segmenter
  * @param userDict    optional user dictionary path (empty by default)
  * @return false as soon as either segmenter fails to initialise;
  *         otherwise the value returned by _setInitFlag(true).
  */
 bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
 {
     assert(!_getInitFlag());

     const bool mpReady = _mpSeg.init(mpSegDict, userDict);
     if(!mpReady)
     {
         LogError("_mpSeg init");
         return false;
     }

     // The HMM segmenter is only initialised once the MP segmenter succeeded.
     const bool hmmReady = _hmmSeg.init(hmmSegDict);
     if(!hmmReady)
     {
         LogError("_hmmSeg init");
         return false;
     }

     LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
     return _setInitFlag(true);
 }
Example #9
0
 /// Returns the DictTrie pointer obtained from the underlying _mpSeg.
 const DictTrie* getDictTrie() const
 {
     const DictTrie* trie = _mpSeg.getDictTrie();
     return trie;
 }