コード例 #1
0
  virtual bool cut(const string& str, vector<vector<string> >& vres) const {
    vres.clear();

    Unicode unicode;
    unicode.reserve(str.size());

    TransCode::decode(str, unicode);

    Unicode::const_iterator left = unicode.begin();
    Unicode::const_iterator right;

    for(right = unicode.begin(); right != unicode.end(); right++) {
      if(isIn(specialSymbols_, *right)) {
        if(left != right) {
          cut(left, right, vres);
        }
		for (vector<vector<string> >::iterator itr = vres.begin(); itr != vres.end(); ++itr) {
			itr->resize(itr->size() + 1);
			TransCode::encode(right, right + 1, itr->back());
		}
        left = right + 1;
      }
    }
    if(left != right) {
      cut(left, right, vres);
    }

    return true;
  }
コード例 #2
0
ファイル: trie_test.cpp プロジェクト: arrack/cppjieba
// Checks a DictTrie built from DICT_FILE only: the minimum weight, the
// serialized form of a single-word lookup, and that the first DAG node for
// "清华大学" serializes the same as the per-prefix lookups collected by hand.
TEST(DictTrieTest, Test1) {
  string s1, s2;
  DictTrie trie(DICT_FILE);
  ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
  string word("来到");
  Unicode uni;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  DictUnit nodeInfo;
  nodeInfo.word = uni;
  nodeInfo.tag = "v";
  nodeInfo.weight = -8.87033;
  s1 << nodeInfo;
  // Guard the lookup so a missing dictionary entry fails the test cleanly
  // instead of dereferencing a null pointer.
  const DictUnit* found = trie.Find(uni.begin(), uni.end());
  ASSERT_TRUE(found != NULL);
  s2 << *found;

  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
  word = "清华大学";
  LocalVector<pair<size_t, const DictUnit*> > res;
  const char * words[] = {"清", "清华", "清华大学"};
  // Expected DAG edges: (index of last character, dictionary entry) per prefix.
  for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
    ASSERT_TRUE(TransCode::Decode(words[i], uni));
    res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
  }
  vector<struct Dag> dags;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  trie.Find(uni.begin(), uni.end(), dags);
  ASSERT_EQ(dags.size(), uni.size());
  ASSERT_NE(dags.size(), 0u);
  s1 << res;
  s2 << dags[0].nexts;
  ASSERT_EQ(s1, s2);
}
コード例 #3
0
ファイル: PosTagger.hpp プロジェクト: Axure/nodejieba
            bool tag(const string& src, vector<pair<string, string> >& res)
            {
                assert(_getInitFlag());
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                    LogError("_mixSegment cut failed");
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                        LogError("decode failed.");
                        return false;
                    }
                    tmp = _dictTrie.find(unico.begin(), unico.end());
                    res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
                }
                tmp = NULL;
                return !res.empty();
            }
コード例 #4
0
ファイル: PosTagger.hpp プロジェクト: hoyoung2015/jiebaR
            bool tag(const string& src, vector<pair<string, string> >& res) const
            {
                vector<string> cutRes;
                if (!_segment.cut(src, cutRes))
                {
                  Rcout<<"_mixSegment cut failed"<<std::endl;
      
                    return false;
                }

                const DictUnit *tmp = NULL;
                Unicode unico;
                for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
                {
                    if (!TransCode::decode(*itr, unico))
                    {
                      Rcout<<"decode failed."<<std::endl;
        
                        return false;
                    }
                    tmp = _dictTrie->find(unico.begin(), unico.end());
                    if(tmp == NULL || tmp->tag.empty())
                    {
                      res.push_back(make_pair(*itr, _specialRule(unico)));
                    }
                    else
                    {
                      res.push_back(make_pair(*itr, tmp->tag));
                    }                }
                return !res.empty();
            }
コード例 #5
0
ファイル: PosTagger.hpp プロジェクト: AllanXiang/cppjieba
  bool tag(const string& src, vector<pair<string, string> >& res) const {
    vector<string> cutRes;
    if (!segment_.cut(src, cutRes)) {
      LogError("mixSegment_ cut failed");
      return false;
    }

    const DictUnit *tmp = NULL;
    Unicode unico;
    const DictTrie * dict = segment_.getDictTrie();
    assert(dict != NULL);
    for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
      if (!TransCode::decode(*itr, unico)) {
        LogError("decode failed.");
        return false;
      }
      tmp = dict->find(unico.begin(), unico.end());
      if(tmp == NULL || tmp->tag.empty()) {
        res.push_back(make_pair(*itr, specialRule_(unico)));
      } else {
        res.push_back(make_pair(*itr, tmp->tag));
      }
    }
    return !res.empty();
  }
コード例 #6
0
ファイル: TTrie.cpp プロジェクト: songcheng/cppjieba
// Smoke test: builds a trie with a user dictionary and runs the
// SegmentChar-producing find() over an ASCII string. Only the decode step is
// asserted; the contents of the find() output are not checked.
TEST(DictTrieTest, automation) {
    DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
    const string input("abcderf");
    Unicode decoded;
    ASSERT_TRUE(TransCode::decode(input, decoded));
    vector<struct SegmentChar> segChars;
    trie.find(decoded.begin(), decoded.end(), segChars);
}
コード例 #7
0
ファイル: MixSegment.hpp プロジェクト: hoyoung2015/jiebaR
            // Segments [begin, end) with _mpSeg, then re-segments every run of
            // consecutive single-character words (that are not user-dictionary
            // single characters) with _hmmSeg. Results are appended to `res`.
            // Returns false as soon as either segmenter reports failure.
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
            {
                vector<Unicode> mpWords;
                mpWords.reserve(end - begin);
                if (!_mpSeg.cut(begin, end, mpWords))
                {
                    Rcout<<"mpSeg cutDAG failed."<<std::endl;
                    return false;
                }

                vector<Unicode> hmmWords;
                hmmWords.reserve(end - begin);
                Unicode run;
                run.reserve(end - begin);

                size_t idx = 0;
                while (idx < mpWords.size())
                {
                    const Unicode& current = mpWords[idx];

                    // Multi-character words and user-dictionary single
                    // characters are accepted as produced by the MP segmenter.
                    if (current.size() != 1 || _mpSeg.isUserDictSingleChineseWord(current[0]))
                    {
                        res.push_back(current);
                        ++idx;
                        continue;
                    }

                    // Gather the run of plain single characters starting here.
                    size_t runEnd = idx;
                    while (runEnd < mpWords.size()
                           && 1 == mpWords[runEnd].size()
                           && !_mpSeg.isUserDictSingleChineseWord(mpWords[runEnd][0]))
                    {
                        run.push_back(mpWords[runEnd][0]);
                        ++runEnd;
                    }

                    // Let the HMM segmenter regroup that run.
                    if (!_hmmSeg.cut(run.begin(), run.end(), hmmWords))
                    {
                        Rcout<<"_hmmSeg cut failed."<<std::endl;
                        return false;
                    }
                    res.insert(res.end(), hmmWords.begin(), hmmWords.end());

                    // Reset the scratch buffers and resume after the run.
                    run.clear();
                    hmmWords.clear();
                    idx = runEnd;
                }
                return true;
            }
コード例 #8
0
ファイル: trie_test.cpp プロジェクト: arrack/cppjieba
// Looks up the word "云计算" in a trie built with a user dictionary and the
// DictTrie::WordWeightMax option, and checks the serialized entry.
TEST(DictTrieTest, UserDictWithMaxWeight) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
  const string query("云计算");
  Unicode decoded;
  ASSERT_TRUE(TransCode::Decode(query, decoded));
  const DictUnit* found = trie.Find(decoded.begin(), decoded.end());
  ASSERT_TRUE(found != NULL);
  string dumped;
  dumped << *found;
  ASSERT_EQ("[\"20113\", \"35745\", \"31639\"]  -2.975", dumped);
}
コード例 #9
0
            virtual bool cut(const string& str, vector<string>& res)const
            {
                assert(_getInitFlag());

                Unicode unicode;
                TransCode::decode(str, unicode);
                res.clear();
                
                Unicode::const_iterator left = unicode.begin();
                Unicode::const_iterator right = unicode.begin();
                
                string oneword;
                while(right != unicode.end())
                {
                    if(isIn(_specialSymbols, *right))
                    {
                        if(left != right)
                        {
                            cut(left, right, res);
                        }
                        TransCode::encode(right, right + 1, oneword);
                        res.push_back(oneword);
                        right ++;
                        left = right;
                    }
                    else
                    {
                        right ++;
                    }
                }
                if(left != right)
                {
                    cut(left, right, res);
                }
                
                return true;
            }
コード例 #10
0
ファイル: Trie.hpp プロジェクト: samevers/posTrunk
  void InsertNode(const Unicode& key, const DictUnit* ptValue) {
    if (key.begin() == key.end()) {
      return;
    }

    TrieNode::NextMap::const_iterator kmIter;
    Unicode::const_iterator citer= key.begin();
    TrieNode *ptNode = _base + (*(citer++));
    for (; citer != key.end(); citer++) {
      if (NULL == ptNode->next) {
        ptNode->next = new TrieNode::NextMap;
      }
      kmIter = ptNode->next->find(*citer);
      if (ptNode->next->end() == kmIter) {
        TrieNode *nextNode = new TrieNode;

        (*(ptNode->next))[*citer] = nextNode;
        ptNode = nextNode;
      } else {
        ptNode = kmIter->second;
      }
    }
    ptNode->ptValue = ptValue;
  }
コード例 #11
0
ファイル: MPSegment.hpp プロジェクト: Sandy4321/chinese_nlp
 bool cut(const string& sentence, 
       vector<string>& words, 
       size_t max_word_len) const {
   Unicode unicode;
   if (!TransCode::decode(sentence, unicode)) {
     return false;
   }
   vector<Unicode> unicodeWords;
   cut(unicode.begin(), unicode.end(), 
         unicodeWords, max_word_len);
   words.resize(unicodeWords.size());
   for (size_t i = 0; i < words.size(); i++) {
     TransCode::encode(unicodeWords[i], words[i]);
   }
   return true;
 }
コード例 #12
0
ファイル: MPSegment.hpp プロジェクト: fc13240/cppjieba
            // Decodes `str` and forwards to the iterator-range cut() overload,
            // returning its result. Returns false (without calling it) when the
            // segmenter is not initialised, when `str` is empty, or when
            // decoding fails; the first and last cases are logged.
            bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const
            {
                if(!_getInitFlag())
                {
                    LogError("not inited.");
                    return false;
                }
                if(str.empty())
                {
                    return false;
                }

                Unicode sentence;
                const bool decoded = TransCode::decode(str, sentence);
                if(decoded)
                {
                    return cut(sentence.begin(), sentence.end(), segWordInfos);
                }

                LogError("TransCode::decode failed.");
                return false;
            }
コード例 #13
0
ファイル: PosTagger.hpp プロジェクト: 1271281914/simhash
  bool Tag(const string& src, vector<pair<string, string> >& res) const {
    vector<string> CutRes;
    segment_.Cut(src, CutRes);

    const DictUnit *tmp = NULL;
    Unicode unico;
    const DictTrie * dict = segment_.GetDictTrie();
    assert(dict != NULL);
    for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
      if (!TransCode::Decode(*itr, unico)) {
        LOG(ERROR) << "Decode failed.";
        return false;
      }
      tmp = dict->Find(unico.begin(), unico.end());
      if (tmp == NULL || tmp->tag.empty()) {
        res.push_back(make_pair(*itr, SpecialRule(unico)));
      } else {
        res.push_back(make_pair(*itr, tmp->tag));
      }
    }
    return !res.empty();
  }
コード例 #14
0
ファイル: TransCode.hpp プロジェクト: gisupc/cppjieba
 /// Convenience overload: encodes the whole of `uni` into `res` by forwarding
 /// its begin/end range to the iterator-range encode(), and returns that
 /// overload's result.
 inline bool encode(const Unicode& uni, string& res)
 {
     return encode(uni.begin(), uni.end(), res);
 }
コード例 #15
0
/// Convenience overload: encodes the whole of `uni` into `res` by forwarding
/// its begin/end range to the iterator-range Encode().
inline void Encode(const Unicode& uni, string& res) {
  Encode(uni.begin(), uni.end(), res);
}
コード例 #16
0
/// Convenience overload: returns the result of the iterator-range Encode()
/// applied to the whole of `unicode`.
inline string Encode(const Unicode& unicode) {
  return Encode(unicode.begin(), unicode.end());
}
コード例 #17
0
ファイル: trie_test.cpp プロジェクト: arrack/cppjieba
// For each sentence, builds the DAG through DictTrie::Find and checks how many
// `nexts` entries every position carries. The last two cases pass an explicit
// fourth argument (3 and 4) to Find.
TEST(DictTrieTest, Dag) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");

  {
    const string sentence("清华大学");
    Unicode decoded;
    ASSERT_TRUE(TransCode::Decode(sentence, decoded));
    vector<struct Dag> dags;
    trie.Find(decoded.begin(), decoded.end(), dags);

    const size_t expected[] = {3, 2, 2, 1};
    const size_t expected_count = sizeof(expected) / sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expected_count);
    for (size_t pos = 0; pos < dags.size(); ++pos) {
      ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
    }
  }

  {
    const string sentence("北京邮电大学");
    Unicode decoded;
    ASSERT_TRUE(TransCode::Decode(sentence, decoded));
    vector<struct Dag> dags;
    trie.Find(decoded.begin(), decoded.end(), dags);

    const size_t expected[] = {3, 1, 2, 2, 2, 1};
    const size_t expected_count = sizeof(expected) / sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expected_count);
    for (size_t pos = 0; pos < dags.size(); ++pos) {
      ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
    }
  }

  {
    const string sentence("长江大桥");
    Unicode decoded;
    ASSERT_TRUE(TransCode::Decode(sentence, decoded));
    vector<struct Dag> dags;
    trie.Find(decoded.begin(), decoded.end(), dags);

    const size_t expected[] = {3, 1, 2, 1};
    const size_t expected_count = sizeof(expected) / sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expected_count);
    for (size_t pos = 0; pos < dags.size(); ++pos) {
      ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
    }
  }

  {
    // Same sentence, with 3 passed as the fourth argument to Find.
    const string sentence("长江大桥");
    Unicode decoded;
    ASSERT_TRUE(TransCode::Decode(sentence, decoded));
    vector<struct Dag> dags;
    trie.Find(decoded.begin(), decoded.end(), dags, 3);

    const size_t expected[] = {2, 1, 2, 1};
    const size_t expected_count = sizeof(expected) / sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expected_count);
    for (size_t pos = 0; pos < dags.size(); ++pos) {
      ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
    }
  }

  {
    // Same sentence, with 4 passed as the fourth argument to Find.
    const string sentence("长江大桥");
    Unicode decoded;
    ASSERT_TRUE(TransCode::Decode(sentence, decoded));
    vector<struct Dag> dags;
    trie.Find(decoded.begin(), decoded.end(), dags, 4);

    const size_t expected[] = {3, 1, 2, 1};
    const size_t expected_count = sizeof(expected) / sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expected_count);
    for (size_t pos = 0; pos < dags.size(); ++pos) {
      ASSERT_EQ(dags[pos].nexts.size(), expected[pos]);
    }
  }
}
コード例 #18
0
    /// Segments [begin, end) with the MP segmenter, then re-segments every
    /// maximal run of consecutive single-character MP words with the HMM
    /// segmenter. Encoded words are appended to `res` in input order; `res`
    /// is not cleared here.
    ///
    /// Returns false when the segmenter is not initialised, when the range is
    /// empty, or when either underlying segmenter reports failure. In the
    /// last case `res` may already contain the words emitted before the
    /// failure.
    bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
    {
        if(!_getInitFlag())
        {
            LogError("not inited.");
            return false;
        }
        if(begin == end)
        {
            return false;
        }
        vector<TrieNodeInfo> infos;
        if(!_mpSeg.cut(begin, end, infos))
        {
            LogError("mpSeg cutDAG failed.");
            return false;
        }

        Unicode unico;            // pending run of single-character words
        vector<Unicode> hmmRes;
        string tmp;
        const size_t count = infos.size();

        // Iterate one step past the last word so that a run still pending at
        // the end is flushed by the same code path as a run ended by a
        // multi-character word.
        for(size_t i = 0; i <= count; i++)
        {
            const bool atEnd = (i == count);
            if(!atEnd && 1 == infos[i].word.size())
            {
                unico.push_back(infos[i].word[0]);
                continue;
            }

            // Hand the pending run to the HMM segmenter and emit its words.
            if(!unico.empty())
            {
                hmmRes.clear();
                if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
                {
                    LogError("_hmmSeg cut failed.");
                    return false;
                }
                for(size_t j = 0; j < hmmRes.size(); j++)
                {
                    TransCode::encode(hmmRes[j], tmp);
                    res.push_back(tmp);
                }
                unico.clear();
            }

            // Multi-character MP words are emitted unchanged.
            if(!atEnd)
            {
                TransCode::encode(infos[i].word, tmp);
                res.push_back(tmp);
            }
        }

        return true;
    }