Beispiel #1
0
// Command-line driver: runs one of three segmenters over the input file.
//
// Options are read through ArgvContext:
//   --algorithm  "cutHMM" selects HMMSegment, "cutMix" selects MixSegment;
//                any other value (including none) falls through to MPSegment,
//                which the usage text advertises as cutDAG.
//   --dictpath   passed to MPSegment::init / MixSegment::init.
//   --modelpath  passed to HMMSegment::init / MixSegment::init.
// The file to segment is taken from arg[1].
//
// Returns EXIT_FAILURE when called without arguments or when the selected
// segmenter fails to initialise, EXIT_SUCCESS otherwise.
int main(int argc, char ** argv)
{
    if(argc < 2)
    {
        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
            <<"options:\n"
            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
            <<"\t--dictpath\tsee example\n"
            <<"\t--modelpath\tsee example\n"
            <<"example:\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n"
            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n"
            <<endl;

        return EXIT_FAILURE;
    }
    ArgvContext arg(argc, argv);
    string dictPath = arg["--dictpath"];
    string modelPath = arg["--modelpath"];
    string algorithm = arg["--algorithm"];

    if("cutHMM" == algorithm)
    {
        HMMSegment seg;
        if(!seg.init(modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else if("cutMix" == algorithm)
    {
        MixSegment seg;
        if(!seg.init(dictPath.c_str(), modelPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    else
    {
        MPSegment seg;
        if(!seg.init(dictPath.c_str()))
        {
            cout<<"seg init failed."<<endl;
            // This used to be `return false`, which converts to exit status 0
            // and therefore told the shell that the run had succeeded.
            return EXIT_FAILURE;
        }
        cut(&seg, arg[1].c_str());
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Beispiel #2
0
 // Initialises the object from the given resource paths.
 //
 // dictPath, hmmFilePath and userDictPath (empty by default) are forwarded
 // unchanged to _segment.init(). Afterwards the dictionary trie exposed by
 // the segment is cached in _dictTrie.
 //
 // Both steps are wrapped in LIMONP_CHECK; NOTE(review): presumably that macro
 // aborts or throws when its argument is false -- confirm in limonp.
 void init(
     const string& dictPath, 
     const string& hmmFilePath,
     const string& userDictPath = ""
 )
 {
     LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
     // Borrow the trie from the segment and verify that it is usable.
     _dictTrie = _segment.getDictTrie();
     LIMONP_CHECK(_dictTrie);
 };
Beispiel #3
0
            bool tag(const string& src, vector<pair<string, string> >& res) const
            {
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                  Rcout<<"_mixSegment cut failed"<<std::endl;
      
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                      Rcout<<"decode failed."<<std::endl;
        
                        return false;
                    }
                    tmp = _dictTrie->find(unico.begin(), unico.end());
                    if(tmp == NULL || tmp->tag.empty())
                    {
                      res.push_back(make_pair(*itr, _specialRule(unico)));
                    }
                    else
                    {
                      res.push_back(make_pair(*itr, tmp->tag));
                    }                }
                return !res.empty();
            }
 // Loads the IDF dictionary and the stop-word dictionary, then initialises
 // the underlying segment with dictPath and hmmFilePath.
 //
 // Always returns true when it returns at all: the segment initialisation is
 // wrapped in LIMONP_CHECK. NOTE(review): presumably that macro aborts or
 // throws on a false argument -- confirm in limonp. The results of the two
 // load helpers are not inspected here.
 bool init(const string &dictPath, const string &hmmFilePath, const string &idfPath, const string &stopWordPath)
 {
     _loadIdfDict(idfPath);
     _loadStopWordDict(stopWordPath);
     LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
     return true;
 };
Beispiel #5
0
            bool tag(const string& src, vector<pair<string, string> >& res)
            {
                assert(_getInitFlag());
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                    LogError("_mixSegment cut failed");
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                        LogError("decode failed.");
                        return false;
                    }
                    tmp = _dictTrie.find(unico.begin(), unico.end());
                    res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
                }
                tmp = NULL;
                return !res.empty();
            }
Beispiel #6
0
 // Initialises the dictionary trie and the segment, then records the outcome
 // in the init flag.
 //
 // Only dictPath and hmmFilePath are used. charStatus, startProb, emitProb,
 // endProb and transProb are accepted but never referenced in this body.
 //
 // Returns whatever _setInitFlag() returns for the result of _segment.init().
 // The two sanity checks are plain asserts, so they vanish when NDEBUG is
 // defined.
 bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
 {
     
     // Calling init twice is treated as a programming error.
     assert(!_getInitFlag());
     _dictTrie.init(dictPath);
     // Relies on _dictTrie being testable in a boolean context.
     assert(_dictTrie);
     return _setInitFlag(_segment.init(dictPath, hmmFilePath));
 };
Beispiel #7
0
            bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
            {
                vector<string> words;
                if(!_segment.cut(str, words))
                {
                    LogError("segment cut(%s) failed.", str.c_str());
                    return false;
                }

                // filtering single word.
                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
                {
                    if(_isSingleWord(*iter))
                    {
                        iter = words.erase(iter);
                    }
                    else
                    {
                        iter++;
                    }
                }

                map<string, double> wordmap;
                for(size_t i = 0; i < words.size(); i ++)
                {
                    wordmap[ words[i] ] += 1.0;
                }

                for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
                {
                    if(_stopWords.end() != _stopWords.find(itr->first))
                    {
                        wordmap.erase(itr++);
                        continue;
                    }

                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                    if(cit != _idfMap.end())
                    {
                        itr->second *= cit->second;
                    }
                    else
                    {
                        itr->second *= _idfAverage;
                    }
                    itr ++;
                }

                keywords.clear();
                std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
                topN = MIN(topN, keywords.size());
                partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
                keywords.resize(topN);
                return true;
            }
Beispiel #8
0
 // C-style wrapper around segment.cut().
 //
 // Cuts `text` into words and hands them back as a malloc'ed array of
 // strdup'ed C strings through `result`. The caller owns the array and every
 // string in it and must release each of them with free().
 //
 // Returns the number of words stored in `result`. When `text` is NULL, no
 // words are produced, or an allocation fails, `result` is set to NULL and 0
 // is returned; nothing is leaked in that case.
 int cut_c(const char* text, char **&result)
 {
     result = NULL;
     if (text == NULL)
     {
         return 0;
     }

     vector<string> words;
     segment.cut(text, words);
     if (words.empty())
     {
         return 0;
     }

     char** out = static_cast<char**>(malloc(sizeof(char*) * words.size()));
     if (out == NULL)
     {
         return 0;
     }

     for (size_t i = 0; i < words.size(); ++i)
     {
         out[i] = strdup(words[i].c_str());
         if (out[i] == NULL)
         {
             // Roll back the copies made so far before reporting failure.
             while (i > 0)
             {
                 free(out[--i]);
             }
             free(out);
             return 0;
         }
     }

     result = out;
     return static_cast<int>(words.size());
 }
Beispiel #9
0
// Demo driver: runs HMMSegment, MixSegment and MPSegment in turn over
// "testlines.utf8", using the dictionary and model files under ../dicts.
// The command-line arguments are not used.
//
// Returns EXIT_FAILURE as soon as one segmenter fails to initialise,
// EXIT_SUCCESS once all three have run.
int main(int argc, char ** argv)
{
    //demo
    {
        HMMSegment seg;
        if(!seg.init("../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    {
        MixSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
        {
            cout<<"seg init failed."<<endl;
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        cout<<"Cut type"<<endl;
        cut_type(&seg, "testlines.utf8");
        cout<<endl;
        seg.dispose();
    }
    {
        MPSegment seg;
        if(!seg.init("../dicts/jieba.dict.utf8"))
        {
            cout<<"seg init failed."<<endl;
            // This used to be `return false`, which converts to exit status 0
            // and therefore reported success to the caller.
            return EXIT_FAILURE;
        }
        cut(&seg, "testlines.utf8");
        seg.dispose();
    }
    return EXIT_SUCCESS;
}
Beispiel #10
0
        // Cuts [begin, end) in two passes and appends the resulting words to
        // `res`.
        //
        // The range is first cut with _mixSeg. Every resulting word that is
        // not longer than _maxWordLen is appended as is; a longer word is cut
        // again with _fullSeg and its pieces are appended instead. If that
        // second cut fails, the long word contributes nothing.
        //
        // Asserts that the object has been initialised. Returns false for an
        // empty or inverted range and when the _mixSeg cut fails, true
        // otherwise.
        bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
        {
            assert(_getInitFlag());
            if (begin >= end)
            {
                LogError("begin >= end");
                return false;
            }

            // First pass: mix cut.
            vector<Unicode> coarseWords;
            if (!_mixSeg.cut(begin, end, coarseWords))
            {
                LogError("_mixSeg cut failed.");
                return false;
            }

            // Scratch buffer for the second pass. It lives outside the loop
            // and is only emptied after a successful full cut.
            vector<Unicode> pieces;
            for (vector<Unicode>::const_iterator word = coarseWords.begin(); word != coarseWords.end(); ++word)
            {
                if (word->size() <= _maxWordLen)
                {
                    // Short enough: keep the mix result.
                    res.push_back(*word);
                    continue;
                }

                if (!_fullSeg.cut(word->begin(), word->end(), pieces))
                {
                    continue;
                }

                res.insert(res.end(), pieces.begin(), pieces.end());
                pieces.clear();
            }

            return true;
        }
Beispiel #11
0
 // One-time initialisation.
 //
 // Initialises _mixSeg with (dict, model) and _fullSeg with (dict), then
 // stores maxWordLen. Returns false, after logging, if the object was
 // already initialised or if either segmenter fails to initialise; otherwise
 // returns the result of _setInitFlag(true).
 bool init(const string& dict, const string& model, size_t maxWordLen)
 {
     // Refuse a second initialisation.
     if (_getInitFlag()) {
         LogError("inited already.");
         return false;
     }

     // The mix segmenter needs both the dictionary and the model.
     if (!_mixSeg.init(dict, model)) {
         LogError("_mixSeg init");
         return false;
     }

     // The full segmenter only needs the dictionary.
     if (!_fullSeg.init(dict)) {
         LogError("_fullSeg init");
         return false;
     }

     _maxWordLen = maxWordLen;
     return _setInitFlag(true);
 }
Beispiel #12
0
 // C-style wrapper: forwards the dictionary and model paths to
 // segment.init() and returns its result unchanged.
 // NOTE(review): the pointers are passed through without a NULL check --
 // confirm that callers never pass NULL.
 bool init_c(const char * dict_path, const char * model_path)
 {
     return segment.init(dict_path, model_path);
 }
Beispiel #13
0
 // Loads the IDF dictionary and the stop-word dictionary, initialises the
 // segment with dictPath and hmmFilePath, and records that outcome through
 // _setInitFlag(), whose return value is handed back to the caller.
 // The results of the two load helpers are not inspected here.
 bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
 {
     _loadIdfDict(idfPath);
     _loadStopWordDict(stopWordPath);

     const bool segmentReady = _segment.init(dictPath, hmmFilePath);
     return _setInitFlag(segmentReady);
 }