Ejemplo n.º 1
0
  void TestToolBoxTrie::testString()
  {
    try
      {

		ToolBox::Trie<std::string> strTrie(std::string(""));
		strTrie.addEntry("Premiere Chaine", 15, std::string("1er"));
		strTrie.addEntry("Deuxieme Chaine", 15, std::string("2eme"));
		{
		  const std::string &s = strTrie.getEntry("unknown", 7);
		  CPPUNIT_ASSERT_EQUAL(std::string(""), s);
		}
		{
		  const std::string &s = strTrie.getEntry("test", 4);
		  CPPUNIT_ASSERT_EQUAL(std::string(""), s);
		}
		{
		  const std::string &s = strTrie.getEntry("Premiere Chaine", 15);
		  CPPUNIT_ASSERT_EQUAL(std::string("1er"), s);
		}
		{
		  const std::string &s = strTrie.getEntry("Premiere Chaine", 14);
		  CPPUNIT_ASSERT_EQUAL(std::string(""), s);
		}
		{
		  const std::string &s = strTrie.getEntry("premiere Chaine", 15);
		  CPPUNIT_ASSERT_EQUAL(std::string(""), s);
		}
		{
		  const std::string &s = strTrie.getEntry("Premiere Chaine ", 16);
		  CPPUNIT_ASSERT_EQUAL(std::string(""), s);
		}
		{
		  const std::string &s = strTrie.getEntry("Deuxieme Chaine", 15);
		  CPPUNIT_ASSERT_EQUAL(std::string("2eme"), s);
		}
	  }
    catch (const ToolBox::Exception &e)
      {
		std::cerr << e << std::endl;
		CPPUNIT_ASSERT(false);
      }
  }
Ejemplo n.º 2
0
/**
 * Cuts inputString into consecutive segments and, for every run of
 * characters that matches an entry of the synonym dictionary, emits the
 * list of its synonyms instead of the plain text.
 *
 * The input is lower-cased first; each character is then converted to UTF-8
 * on its own and fed character by character to a trie built over the
 * synonym dictionary. At each start position the longest matching entry
 * wins.
 *
 * @param inputString   text to analyze.
 * @param synonymOutput receives one vector per segment, in input order: a
 *                      single-element vector for text without synonyms, or
 *                      the de-duplicated synonyms for a matched run. Nothing
 *                      is appended when the function returns false.
 * @return true if at least one dictionary entry matched, false otherwise.
 */
bool CommonLanguageAnalyzer::analyze_synonym_impl(const izenelib::util::UString& inputString, SynonymOutputType& synonymOutput)
{
    bool retFoundSynonym = false;

    const UString::CharT* pInput = inputString.c_str();
    const size_t length = inputString.length();

    // Grow the scratch buffers when the input does not fit. The buffers are
    // arrays, so they must be released with delete[]; the replacement is
    // allocated first so that a throwing new leaves the member valid.
    // The member limits are deliberately left untouched: term_ustring_buffer_limit_
    // is also used below as the bound of synonym_ustring_buffer_.
    size_t ustringBufferSize = term_ustring_buffer_limit_;
    if (ustringBufferSize < length+1)
    {
        ustringBufferSize = length+1;
        UString::CharT* grown = new UString::CharT[ustringBufferSize];
        delete[] lowercase_ustring_buffer_;
        lowercase_ustring_buffer_ = grown;
    }
    size_t stringBufferSize = term_string_buffer_limit_;
    if (stringBufferSize < length*4)
    {
        stringBufferSize = length*4;
        char* grown = new char[stringBufferSize];
        delete[] lowercase_string_buffer_;
        lowercase_string_buffer_ = grown;
    }

    // Lower-case the input. The real size of the (possibly grown) buffer is
    // passed so that inputs longer than the configured limit are converted
    // completely.
    UString::CharT* lowercaseTermUstr = lowercase_ustring_buffer_;
    const bool lowercaseIsDifferent = UString::toLowerString(pInput, length,
                                      lowercase_ustring_buffer_, ustringBufferSize);
    if (lowercaseIsDifferent)
        pInput = lowercaseTermUstr;

    // Convert every character to UTF-8 separately. charOffs[i] is the offset
    // in chars at which character i starts; one extra byte is left after
    // each converted character (presumably for its terminator, since each
    // piece is later handed to the trie as a C string).
    char* chars = lowercase_string_buffer_;
    std::vector<size_t> charOffs(length);
    size_t curOff = 0;
    for (size_t i = 0; i < length; i++)
    {
        charOffs[i] = curOff;
        const size_t charLen = UString::convertString(UString::UTF_8, pInput+i, 1, chars+curOff, 4);
        curOff += charLen + 1;
    }

    // search synonym dict for input
    izenelib::am::StrBasedVTrie strTrie(pSynonymContainer_->getData());

    size_t curIdx = 0;      // character currently tried as start of a match
    size_t startIdx = 0;    // first character not yet written to synonymOutput
    VTrieNode endNode;      // trie node of the longest match found so far
    while (curIdx < length)
    {
        // size_t(-1) means "no match starting at curIdx".
        size_t wordEndIdx = size_t(-1);

        char* pch = chars+charOffs[curIdx];
        strTrie.firstSearch(pch);
        // NOTE(review): a start character whose node reports moreLong == false
        // is skipped before exists() is consulted, so an entry consisting of
        // this single character alone would never be reported - confirm this
        // is intended.
        if (strTrie.completeSearch == false || strTrie.node->moreLong == false)
        {
            curIdx ++;
            continue;
        }

        if (strTrie.exists())
        {
            wordEndIdx = curIdx;
            strTrie.getCurrentNode(endNode);
        }
        //else // uncomment if minimum match
        {
            // Keep extending the match; the last position at which an entry
            // exists is remembered, which yields the longest match.
            for (size_t j = curIdx + 1; j < length; j ++)
            {
                char* pnch = chars+charOffs[j];
                strTrie.search(pnch);

                if (strTrie.completeSearch == false)
                    break;

                // matched a word
                if (strTrie.exists())
                {
                    wordEndIdx = j;
                    strTrie.getCurrentNode(endNode);
                    //break; // uncomment if minimum match
                }
            }
        }

        // No dictionary entry starts here: try the next character.
        if (wordEndIdx == size_t(-1))
        {
            curIdx++;
            continue;
        }

        retFoundSynonym = true;

        // Text between the previous match and this one has no synonym and is
        // emitted as a single-element segment.
        if (startIdx < curIdx)
        {
            std::vector<UString> plainSegment;
            plainSegment.push_back(UString(pInput+startIdx, curIdx-startIdx));
            synonymOutput.push_back(plainSegment);
        }

        // segment with synonyms
        std::vector<UString> segment;
        pSynonymContainer_->setSynonym(pSynonymResult_, &endNode);

        std::set<UString> synonymSet; // avoid duplication
        size_t cnt, idx = 0;
        do
        {
            cnt = pSynonymResult_->getSynonymCount(idx);
            for (size_t off = 0; off < cnt; off++)
            {
                char * synonymResult = pSynonymResult_->getWord(idx, off);
                if (!synonymResult)
                    continue;

                // A synonym that does not fit the conversion buffer is
                // skipped; converting nothing and reusing the previous
                // buffer content would emit a stale or empty synonym.
                const size_t synonymResultLen = strlen(synonymResult);
                if (synonymResultLen > term_ustring_buffer_limit_)
                    continue;

                const size_t synonymUstrLen = UString::toUcs2(synonymEncode_,
                        synonymResult, synonymResultLen, synonym_ustring_buffer_, term_ustring_buffer_limit_);

                UString synonym(synonym_ustring_buffer_, synonymUstrLen);
                if (synonymSet.insert(synonym).second)
                    segment.push_back(synonym);
            }

            idx++;
        }
        while (cnt > 0);
        synonymOutput.push_back(segment);

        // Resume right after the matched run.
        curIdx = wordEndIdx+1;
        startIdx = curIdx;
    }

    if (!retFoundSynonym)
        return false;

    // Trailing text after the last match.
    if (startIdx < curIdx)
    {
        std::vector<UString> plainSegment;
        plainSegment.push_back(UString(pInput+startIdx, curIdx-startIdx));
        synonymOutput.push_back(plainSegment);
    }

    return true;
}