Пример #1
0
            bool tag(const string& src, vector<pair<string, string> >& res) const
            {
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                  Rcout<<"_mixSegment cut failed"<<std::endl;
      
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                      Rcout<<"decode failed."<<std::endl;
        
                        return false;
                    }
                    tmp = _dictTrie->find(unico.begin(), unico.end());
                    if(tmp == NULL || tmp->tag.empty())
                    {
                      res.push_back(make_pair(*itr, _specialRule(unico)));
                    }
                    else
                    {
                      res.push_back(make_pair(*itr, tmp->tag));
                    }                }
                return !res.empty();
            }
Пример #2
0
            bool tag(const string& src, vector<pair<string, string> >& res)
            {
                assert(_getInitFlag());
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                    LogError("_mixSegment cut failed");
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                        LogError("decode failed.");
                        return false;
                    }
                    tmp = _dictTrie.find(unico.begin(), unico.end());
                    res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
                }
                tmp = NULL;
                return !res.empty();
            }
Пример #3
0
            bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
            {
                vector<string> words;
                if(!_segment.cut(str, words))
                {
                    LogError("segment cut(%s) failed.", str.c_str());
                    return false;
                }

                // filtering single word.
                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
                {
                    if(_isSingleWord(*iter))
                    {
                        iter = words.erase(iter);
                    }
                    else
                    {
                        iter++;
                    }
                }

                map<string, double> wordmap;
                for(size_t i = 0; i < words.size(); i ++)
                {
                    wordmap[ words[i] ] += 1.0;
                }

                for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
                {
                    if(_stopWords.end() != _stopWords.find(itr->first))
                    {
                        wordmap.erase(itr++);
                        continue;
                    }

                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                    if(cit != _idfMap.end())
                    {
                        itr->second *= cit->second;
                    }
                    else
                    {
                        itr->second *= _idfAverage;
                    }
                    itr ++;
                }

                keywords.clear();
                std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
                topN = MIN(topN, keywords.size());
                partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
                keywords.resize(topN);
                return true;
            }
Пример #4
0
 // C-friendly wrapper: segments `text` and returns the words as a malloc'ed
 // array of malloc'ed, NUL-terminated strings.
 //
 // @param text   NUL-terminated input; a NULL pointer is treated as empty input.
 // @param result output; set to the array of word strings, or to NULL when the
 //               function returns 0. The caller owns the array and every string
 //               in it and must release each with free().
 // @return number of entries stored in `result`; 0 when there are no words or
 //         when memory allocation fails.
 int cut_c(const char* text, char **&result)
 {
     result = NULL;
     if (text == NULL)
     {
         // Constructing a std::string from NULL is undefined behaviour.
         return 0;
     }

     vector<string> words;
     // NOTE(review): the outcome of segment.cut() is not checked here, exactly
     // as before; whatever ends up in `words` is what gets returned.
     segment.cut(text, words);
     if (words.empty())
     {
         // Avoid malloc(0), whose result is implementation-defined.
         return 0;
     }

     const size_t count = words.size();
     char **out = static_cast<char**>(malloc(sizeof(char*) * count));
     if (out == NULL)
     {
         return 0;
     }

     for (size_t i = 0; i < count; ++i)
     {
         out[i] = strdup(words[i].c_str());
         if (out[i] == NULL)
         {
             // Out of memory part-way through: release what was copied so far
             // so that nothing leaks and the caller never sees a partial array.
             while (i > 0)
             {
                 free(out[--i]);
             }
             free(out);
             return 0;
         }
     }

     result = out;
     return static_cast<int>(count);
 }
Пример #5
0
        // Segments the range [begin, end) and appends the resulting words to `res`.
        //
        // The range is first cut with _mixSeg. Every word longer than _maxWordLen
        // is then re-cut with _fullSeg and replaced by those pieces; shorter words
        // are appended unchanged.
        //
        // Returns false when the range is empty/inverted or when _mixSeg fails,
        // true otherwise. Asserts that the object has been initialised.
        //
        // NOTE(review): when _fullSeg.cut() fails for a long word, that word is
        // skipped without an error and the scratch buffer is not cleared -
        // confirm this is intended.
        bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
        {
            assert(_getInitFlag());
            if (begin >= end)
            {
                LogError("begin >= end");
                return false;
            }

            // First pass: mixed segmentation of the whole range.
            vector<Unicode> mixWords;
            if (!_mixSeg.cut(begin, end, mixWords))
            {
                LogError("_mixSeg cut failed.");
                return false;
            }

            // Scratch buffer reused across iterations for the second pass.
            vector<Unicode> pieces;
            for (size_t i = 0; i < mixWords.size(); ++i)
            {
                const Unicode& word = mixWords[i];

                // Short enough: keep the mixed-segmentation result as is.
                if (word.size() <= _maxWordLen)
                {
                    res.push_back(word);
                    continue;
                }

                // Too long: split it further with the full segmenter.
                if (!_fullSeg.cut(word.begin(), word.end(), pieces))
                {
                    continue;
                }
                res.insert(res.end(), pieces.begin(), pieces.end());
                pieces.clear();
            }

            return true;
        }