Exemplo n.º 1
0
CEString CT_LexPreprocessor::sInsertStress(int iLetter, CEString s_)
// Returns a copy of s_ with the stress mark "<" placed in front of the
// character at index iLetter. If iLetter is not a valid index into s_
// (negative, or not less than the length), s_ is returned unchanged.
{
    const bool bOutOfRange = (iLetter < 0) || (iLetter >= (int)(s_.uiLength()));
    if (bOutOfRange)
    {
        return s_;
    }

    // Split at iLetter and glue the halves back together around the mark.
    CEString sBeforeMark = s_.sSubstr(0, iLetter);
    CEString sAfterMark = s_.sSubstr(iLetter, s_.uiLength() - iLetter);
    return sBeforeMark + L"<" + sAfterMark;
}
Exemplo n.º 2
0
int CT_LexPreprocessor::iDeleteStress(CEString& s_)
// Removes the first "<" stress mark from s_ (in place) and returns the index
// at which it was found, i.e. the index, in the resulting string, of the
// character that used to stand right after the mark.
// Returns -1 and leaves s_ untouched if uiFind() reports no position inside s_.
{
    const unsigned int uiMark = s_.uiFind(L"<");
    if (uiMark < s_.uiLength())
    {
        // Keep everything before the mark and everything after it.
        CEString sHead = s_.sSubstr(0, uiMark);
        CEString sTail = s_.sSubstr(uiMark + 1, s_.uiLength() - uiMark - 1);
        s_ = sHead + sTail;
        return (int)uiMark;
    }
    return -1;
}
Exemplo n.º 3
0
int CAnalyzer::iCheckEndings(vector<CHasher>& vecPossibleWordforms,
                             vector<stStemLinks>& vecStems,
                             CEString sLeft,
                             CEString sRight,
                             int iStressPosEnding)
// Appends to vecPossibleWordforms the wordforms obtainable from the split
// sLeft (stem candidate) + sRight (ending candidate).
//
// If vecStems IS NOT empty:
// For every stem in vecStems, take the endings table it links to
// (iEndingsLink) and look up sRight with stress position iStressPosEnding;
// for every gram hash found, add a wordform carrying the stem's lexeme ID
// and lemma.
//
// If vecStems IS empty (the lexeme has to be guessed):
// Nothing is done when sLeft has 2 characters or fewer. Otherwise, for every
// endings table registered for sRight in umap_endings2subtbl, check that
// sLeft ends in the table's stem-final pattern (if any) and is longer than
// the table's right cut, build the lemma from sLeft, and add one wordform
// per gram hash with lexeme ID 0.
// (A wordform whose lemma and gram hash are already present is not added again.)
//
// Returns -1 if m_pDb is NULL, 0 otherwise.
{
    if (m_pDb == NULL)
    {
        return -1;
    }

    // Known stems: one wordform per matching ending in each stem's table.
    for (vector<stStemLinks>::iterator itStem = vecStems.begin();
         itStem != vecStems.end(); ++itStem)
    {
        vector<int> vecGram = arr_freq_endings[itStem->iEndingsLink].m_vecFind (sRight, iStressPosEnding);
        for (vector<int>::iterator itHash = vecGram.begin();
             itHash != vecGram.end(); ++itHash)
        {
            CHasher tmpWf;
            tmpWf.hDecodeHash(*itHash);
            tmpWf.m_llLexemeId = itStem->llLexemeId;
            tmpWf.m_sLemma = itStem->sLemma;
            vecPossibleWordforms.push_back (tmpWf);
        }
    }

    if (!vecStems.empty())
    {
        return 0;
    }

    // No stems were supplied: try to guess the lexeme.
    if (sLeft.uiLength() <= 2)
    {
        return 0;
    }

    // Collect the endings tables that contain sRight.
    vector<int> vecPossibleTables;
    pair<multimap<CEString, int>::iterator,
         multimap<CEString, int>::iterator> pairRange = umap_endings2subtbl.equal_range(sRight);
    for (; pairRange.first != pairRange.second; ++pairRange.first)
    {
        vecPossibleTables.push_back(pairRange.first->second);
    }

    for (vector<int>::iterator itTable = vecPossibleTables.begin();
         itTable != vecPossibleTables.end(); ++itTable)
    {
        // The table may require the stem to end in a particular pattern.
        if (arr_freq_endings[*itTable].m_sStemFinal.uiLength() > 0 &&
            !sLeft.bRegexMatch (L"^.*(" + arr_freq_endings[*itTable].m_sStemFinal + L")$"))
        {
            continue;
        }
        // Something must remain of sLeft once m_iCutRight characters are cut off.
        if (sLeft.uiLength() <= arr_freq_endings[*itTable].m_iCutRight)
        {
            continue;
        }
        // -2 is the stress position iAnalyze uses when the stress is unknown;
        // presumably m_vecFind then matches regardless of stress -- TODO confirm.
        vector<int> vecGram = arr_freq_endings[*itTable].m_vecFind(sRight, -2);
        if (vecGram.empty())
        {
            continue;
        }

        // The lemma depends only on the table, not on the individual hash,
        // so build and validate it once per table.
        CEString sGuessedLemma = sLeft.sSubstr(0, sLeft.uiLength() - arr_freq_endings[*itTable].m_iCutRight)
                                 + arr_freq_endings[*itTable].m_sLemmaFinal;
        if (!bIsValidLemma (sGuessedLemma))
        {
            continue;
        }

        for (vector<int>::iterator itHash = vecGram.begin();
             itHash != vecGram.end(); ++itHash)
        {
            // Skip the hash if the same lemma + gram hash is already stored.
            bool bExists = false;
            for (vector<CHasher>::iterator itWf = vecPossibleWordforms.begin();
                 itWf != vecPossibleWordforms.end(); ++itWf)
            {
                if (itWf->m_sLemma == sGuessedLemma && itWf->iGramHash() == *itHash)
                {
                    bExists = true;
                    break;
                }
            }
            if (bExists)
            {
                continue;
            }

            CHasher tmpWf;
            tmpWf.m_sLemma = sGuessedLemma;
            tmpWf.hDecodeHash(*itHash);
            tmpWf.m_llLexemeId = 0;     // guessed lexeme: no ID available
            vecPossibleWordforms.push_back(tmpWf);
        }
    }
    return 0;
}
Exemplo n.º 4
0
// Splits the stress position of a whole wordform between a stem of length
// iStemLength and the ending that follows it: no stress (-1) yields -2 for
// both parts; otherwise the part carrying the stress gets its position
// relative to that part and the other part gets -1.
static void SplitStressPos(int iStressPos,
                           int iStemLength,
                           int& iStressPosStem,
                           int& iStressPosEnding)
{
    if (iStressPos == -1)
    {
        iStressPosStem = iStressPosEnding = -2;
    }
    else if (iStressPos >= iStemLength)
    {
        iStressPosStem = -1;
        iStressPosEnding = iStressPos - iStemLength;
    }
    else
    {
        iStressPosStem = iStressPos;
        iStressPosEnding = -1;
    }
}

int CAnalyzer::iAnalyze(CEString sWordform,
                        vector<CHasher>& vecPossibleWordforms,
                        BOOL bGuess)
// Analyzes sWordform and fills vecPossibleWordforms with the candidate
// wordforms. The previous contents of vecPossibleWordforms are discarded.
//
// sWordform may carry one stress mark: either "<" in front of the stressed
// letter or U+0301 right after it. The mark is removed before analysis.
//
// The steps, each tried only if the previous one found nothing:
//   1. every stem + ending split of the word, with stems looked up via
//      iLookUpStems and endings checked via iCheckEndings;
//   2. cutting off one of the prefixes "пол-", "пол", "не" and analyzing
//      the remainder recursively, keeping only the admissible wordforms;
//   3. if bGuess is TRUE, guessing the lexeme from the ending alone.
//
// Returns -1 if sWordform is empty or m_pDb is NULL; otherwise the number of
// wordforms in vecPossibleWordforms.
{
    if (sWordform.uiLength() == 0 || m_pDb == NULL)
    {
        return -1;
    }

    // The prefix step below works on the unmodified input (stress mark
    // included); the recursive call strips the mark again.
    CEString sWordformOriginal(sWordform);

    // Extract the stress mark exactly once. Only the first mark is handled;
    // any further marks stay in the string.
    // NOTE(review): multiple stress marks remain an open question.
    int iStressPos = -1;
    const unsigned int uiMarkPos = sWordform.uiFindOneOf(0, L"<\u0301");
    if (ecNotFound != uiMarkPos)
    {
        const bool bMarkPrecedesLetter = (sWordform[uiMarkPos] == L'<');
        sWordform.sErase(uiMarkPos, 1);
        if (bMarkPrecedesLetter)        // кор<ова
        {
            iStressPos = (int)uiMarkPos;
        }
        else                            // коро\u0301ва
        {
            // A leading combining mark gives -1, i.e. "no stress".
            iStressPos = (int)uiMarkPos - 1;
        }
    }

    int iStressPosStem = -1, iStressPosEnding = -1;
    CEString sLeft, sRight;
    vector<stStemLinks> vecStems;
    vecPossibleWordforms.clear();

    // Step 1: try every split, longest stem first.
    const int iLength = (int)sWordform.uiLength();
    for (int iLeft = iLength; iLeft >= 0; --iLeft)
    {
        sLeft = sWordform.sSubstr(0, iLeft);
        sRight = sWordform.sSubstr(iLeft, sWordform.uiLength() - iLeft);
        SplitStressPos(iStressPos, iLeft, iStressPosStem, iStressPosEnding);

        vecStems.clear();
        iLookUpStems(vecStems, sLeft, iStressPosStem);
        if (vecStems.empty())
        {
            continue;
        }
        iCheckEndings(vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
    }

    // Step 2: if we have no result, try cutting off possible prefixes.
    // The shortest recognized prefix has 2 characters.
    if (vecPossibleWordforms.empty())
    {
        for (int iLeft = min(sWordformOriginal.uiLength(), 4); iLeft >= 2; --iLeft)
        {
            sLeft = sWordformOriginal.sSubstr(0, iLeft);
            const bool bPolPrefix = (iLeft == 4 && sLeft == L"пол-") ||
                                    (iLeft == 3 && sLeft == L"пол");
            const bool bNePrefix = (iLeft == 2 && sLeft == L"не");
            if (!bPolPrefix && !bNePrefix)
            {
                continue;
            }

            sRight = sWordformOriginal.sSubstr(iLeft, sWordformOriginal.uiLength() - iLeft);
            const int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess);
            if (iResult <= 0)
            {
                continue;
            }

            // Walk backwards so that erasing does not shift unvisited items.
            for (int iWf = iResult - 1; iWf >= 0; --iWf)
            {
                bool bKeep = false;
                {
                    const CHasher& wf = vecPossibleWordforms[iWf];
                    if (bPolPrefix)
                    {
                        // "пол" / "пол-" is kept only with a genitive singular noun.
                        bKeep = (wf.m_ePos == POS_NOUN &&
                                 wf.m_eNumber == NUM_SG &&
                                 wf.m_eCase == CASE_GEN);
                    }
                    else
                    {
                        // "не" is kept with nouns, adjectives and those verb
                        // forms that are not present tense, past tense or
                        // infinitive.
                        const bool bExcludedVerbForm =
                            (wf.m_eSubparadigm == SUBPARADIGM_PRESENT_TENSE ||
                             wf.m_eSubparadigm == SUBPARADIGM_PAST_TENSE ||
                             wf.m_eSubparadigm == SUBPARADIGM_INFINITIVE);
                        bKeep = (wf.m_ePos == POS_NOUN ||
                                 wf.m_ePos == POS_ADJ ||
                                 (wf.m_ePos == POS_VERB && !bExcludedVerbForm));
                    }
                }
                if (!bKeep)
                {
                    vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf);
                }
            }
            if (!vecPossibleWordforms.empty())
            {
                return static_cast<int>(vecPossibleWordforms.size());
            }
        }
    }

    // Step 3: if we still haven't found anything, we may guess the lexeme.
    if (vecPossibleWordforms.empty() && bGuess == TRUE)
    {
        for (int iLeft = 0; iLeft <= iLength; ++iLeft)
        {
            sLeft = sWordform.sSubstr (0, iLeft);
            sRight = sWordform.sSubstr (iLeft, sWordform.uiLength() - iLeft);
            SplitStressPos(iStressPos, iLeft, iStressPosStem, iStressPosEnding);

            // An empty stem list makes iCheckEndings guess from the ending.
            vecStems.clear();
            iCheckEndings (vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
            if ((bContainsPlausibleVariants (vecPossibleWordforms) && sRight.uiLength() <= 3) ||
                vecPossibleWordforms.size() >= 4)
            {
                break;
            }
        }
        if (vecPossibleWordforms.size() > 4)
        {
            LeaveMostPlausible (vecPossibleWordforms);
        }
    }
    return static_cast<int>(vecPossibleWordforms.size());
}
Exemplo n.º 5
0
int _tmain(int argc, _TCHAR* argv[])
{
    wstring sReplaceableB (L"0123456789012345");
    wstring sReplacedB = sReplaceableB.replace (5, 6, L"abcd");

    CEString sReplacableC(L"0123456789012345678901234567890123567890");
    CEString sReplacedCC = sReplacableC.sReplace(L"567890", L"abcd");
    CEString sReplacedC = sReplacableC.sReplace(5, 6, L"abcd");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (5, 3, L"a");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (8, 2, L"ab");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (5, 3, L"a");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (8, 2, L"abc");

    wstring sEraseableB (L"0123456789");
    wstring sErasedB = sEraseableB.erase (5, 3);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5, 5);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5, 7);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase();

    try
    {
        sEraseableB = L"0123456789";
        sErasedB = sEraseableB.erase (12, 7);
    }
    catch (...)
    {
    }

    try
    {
        sEraseableB = L"0123456789";
        sErasedB = sEraseableB.erase (12);
    }
    catch (...)
    {
    }

    // Ctors
    CEString sEmptyString;
    if (0 != sEmptyString.uiLength() || 0 != sEmptyString.uiGetNumOfTokens() || 
        0 != sEmptyString.uiGetNumOfFields() || 0 != sEmptyString.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }


    CEString sCopy (sEmptyString);
    if (0 != sCopy.uiLength() || 0 != sCopy.uiGetNumOfTokens() || 
        0 != sCopy.uiGetNumOfFields() || 0 != sCopy.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }

    sCopy = L"0123456789";
    CEString sCopy2 (sCopy);

    sCopy2.SetBreakChars (L" -/");
    CEString sCopy3 (sCopy2);

    CEString sFromCString (L"0123456789");
    if (10 != sFromCString.uiLength() || 1 != sFromCString.uiGetNumOfTokens() || 
        1 != sFromCString.uiGetNumOfFields() || 10 != sFromCString.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }

// TODO: operator ()

    CEString sSquareBracketsTest (L"0123456789");
    sSquareBracketsTest[1] = L'a';
    if (sSquareBracketsTest != L"0a23456789")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    sSquareBracketsTest[0] = L'a';
    if (sSquareBracketsTest != L"a123456789")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    sSquareBracketsTest[9] = L'a';
    if (sSquareBracketsTest != L"012345678a")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    CEString sLetter = sSquareBracketsTest[1];
    if (L"1" != sLetter)
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    // Comparison
    ERelation eRet = CEString::eCompare (L"1234567", L"1234567");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234567", L"1234566");
    if (eRet != ecGreater)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234566", L"1234567");
    if (eRet != ecLess)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"123456", L"1234567");
    if (eRet != ecLess)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234567", L"123456");
    if (eRet != ecGreater)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompareNoCase (L"AbCdEfG", L"ABCDEFg");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompareNoCase (L"АбВгДЕ", L"АБВГДе");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bool bRet = CEString::bIn (L'2', L"0123456789");
    if (!bRet)
    {
        ERROR_LOG (L"bIn() failed.");
    }

    bRet = CEString::bIn (L'a', L"0123456789");
    if (bRet)
    {
        ERROR_LOG (L"bIn() failed.");
    }

    CEString sSearcheable (L"0123456789");
    unsigned int uiFindRet = sSearcheable.uiFind (L"123");
    if (1 != uiFindRet)
    {
        ERROR_LOG (L"uiFind() failed.");
    }

    uiFindRet = sSearcheable.uiFind (L"abc");
    if (ecNotFound != uiFindRet)
    {
        ERROR_LOG (L"uiFind() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    uiFindRet = sSearcheable.uiFindNoCase (L"bCDEF");
    if (ecNotFound == uiFindRet)
    {
        ERROR_LOG (L"uiFindNoCase() failed.");
    }

    sSearcheable = L"012345543210";
    uiFindRet = sSearcheable.uiRFind (L"5");
    if (6 != uiFindRet)
    {
        ERROR_LOG (L"uiRFind() failed.");
    }

//    unsigned int uiRFindNoCase (const wchar_t * szRhs) const

    sSearcheable = L"0123456789";
    uiFindRet = sSearcheable.uiFindFirstOf (L"234");
    if (2 != uiFindRet)
    {
        ERROR_LOG (L"uiFindFirstOf() failed.");
    }

//    unsigned int uiFindFirstOfNoCase (const wchar_t * szSet) const
    sSearcheable = L"0120120123456789";
    uiFindRet = sSearcheable.uiFindOneOf (3, L"234");
    if (5 != uiFindRet)
    {
        ERROR_LOG (L"uiFindOneOf() failed.");
    }

    sSearcheable = L"0123456789";
    uiFindRet = sSearcheable.uiFindLastOf (L"234");
    if (4 != uiFindRet)
    {
        ERROR_LOG (L"uiFindLastOf() failed.");
    }

//    unsigned int uiFindLastOfNoCase (const wchar_t * szSet) const
    bRet = sSearcheable.bStartsWith (L"012");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    bRet = sSearcheable.bStartsWith (L"234");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bStartsWithNoCase (L"abcd");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithNoCase() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bStartsWithOneOf (L"012");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOf (L"123");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bStartsWithOneOfNoCase (L"abc");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOfNoCase (L"bc");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    sSearcheable = L"аБвГдЕёжзи";
    bRet = sSearcheable.bStartsWithOneOfNoCase (L"абв");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOfNoCase (L"бв");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bEndsWith (L"789");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    bRet = sSearcheable.bEndsWith (L"123");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    sSearcheable = L"abcdeFgHiJ";
    bRet = sSearcheable.bEndsWithNoCase (L"hij");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    bRet = sSearcheable.bEndsWithNoCase (L"ghi");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    sSearcheable = L"абвгдЕёЖзИ";
    bRet = sSearcheable.bEndsWithNoCase (L"жзи");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    bRet = sSearcheable.bEndsWithNoCase (L"ёжз");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bEndsWithOneOf (L"ab9");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOf() failed.");
    }

    bRet = sSearcheable.bEndsWithOneOf (L"ab8");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOf() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abj");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abc");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    sSearcheable = L"абвгдЕёЖзИ";
    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abи");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abc");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    // Operators
    CEString sLhs (L"01234");
    CEString sRhs (L"56789");
    
    bRet = (sLhs == sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs == L"01234");
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (L"01234" == sLhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

//    CString csLhs (L"01234");
//    CString csRhs (L"56789");
//    bRet = (L"01234" == csLhs);
//    if (!bRet)
//    {
//        ERROR_LOG (L"CString behavior does not match CEString behavior");
//    }

    bRet = (sLhs < sRhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs > sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs <= sRhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs >= sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs >= sLhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }


    sEmptyString = L"0123456";
    if (sEmptyString != L"0123456")
    {
        ERROR_LOG (L"Assignemnt or comparison error");
    }

    sEmptyString = sRhs;
    if (sEmptyString != sRhs)
    {
        ERROR_LOG (L"Assignemnt or comparison error");
    }

    //CEString sResult = sLhs + sRhs;
    //if (sResult != L"0123456789")
    //{
    //    ERROR_LOG (L"Concatenation or comparison error");
    //}
    //sResult += L"<--Concatenated";
    //if (sResult != L"0123456789<--Concatenated")
    //{
    //    ERROR_LOG (L"Concatenation or comparison error");
    //}

    CEString sInsertable (L"0123789");
    CEString sInserted = sInsertable.sInsert (4, L"456");
    if (sInserted != sInsertable || sInsertable != L"0123456789")
    {
        ERROR_LOG (L"Insertion error");
    }

    sInsertable = L"012456789";
    sInserted = sInsertable.sInsert (3, L'3');
    if (sInserted != sInsertable || sInsertable != L"0123456789")
    {
        ERROR_LOG (L"Insertion error");
    }

    CEString sErasable (L"012abcd3456789");
    CEString sErased = sErasable.sErase (3, 4);
    if (sErased != sErasable || sErasable != L"0123456789")
    {
        ERROR_LOG (L"Erase error");
    }
    
    sErasable = L"0123456789";
    sErased = sErasable.sErase (3, 7);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789";
    sErased = sErasable.sErase (3, 40);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789";
    sErased = sErasable.sErase (3);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789a";
    sErased = sErasable.sErase (10);
    if (sErased != sErasable || sErasable != L"0123456789")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable.Erase();
    if (!sErasable.bIsEmpty() || sErasable.uiLength() != 0)
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789";

    CEString sConvertToUppercase(L"aAbBcC");
    sConvertToUppercase.ToUpper();
    if (sConvertToUppercase != L"AABBCC")
    {
        ERROR_LOG(L"ToUpper error");
    }

    sConvertToUppercase = CEString::sToUpper(L"aAbBcC");
    if (sConvertToUppercase != L"AABBCC")
    {
        ERROR_LOG(L"ToUpper error");
    }

    CEString sConvertToUppercaseCyr(L"aABbcCаАбБвВ");
    sConvertToUppercaseCyr.ToUpper();
    if (sConvertToUppercaseCyr != L"AABBCCААББВВ")
    {
        ERROR_LOG(L"ToUpper error for Cyrillic");
    }

    sConvertToUppercaseCyr = CEString::sToUpper(L"aAbBcCаАбБвВ");
    if (sConvertToUppercaseCyr != L"AABBCCААББВВ")
    {
        ERROR_LOG(L"sToUpper error for Cyrillic");
    }

    CEString sConvertToLowercase(L"aABbcC");
    sConvertToLowercase.ToLower();
    if (sConvertToLowercase != L"aabbcc")
    {
        ERROR_LOG(L"ToLower error");
    }

    sConvertToLowercase = CEString::sToLower(L"aAbBcC");
    if (sConvertToLowercase != L"aabbcc")
    {
        ERROR_LOG(L"ToLower error");
    }

    CEString sConvertToLowercaseCyr(L"aABbcCаАбБвВ");
    sConvertToLowercaseCyr.ToLower();
    if (sConvertToLowercaseCyr != L"aabbccааббвв")
    {
        ERROR_LOG(L"ToLower error for Cyrillic");
    }

    sConvertToLowercaseCyr = CEString::sToLower(L"aAbBcCаАбБвВ");
    if (sConvertToLowercaseCyr != L"aabbccааббвв")
    {
        ERROR_LOG(L"sToLower error for Cyrillic");
    }

    CEString sFromAscii = CEString::sToString("abcdefgxyzABCDEFGXYZ01234567890.,!");
    if (sFromAscii != L"abcdefgxyzABCDEFGXYZ01234567890.,!")
    {
        ERROR_LOG(L"sToString error for ascii conversion");
    }

    CEString sReplaceable(L"01abcd6789");
    CEString sReplaced = sReplaceable.sReplace (2, L"2345");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0123456abc";
    sReplaced = sReplaceable.sReplace (7, L"789");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0123456a89";
    sReplaced = sReplaceable.sReplace (7, L'7');
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"012345678a";
    sReplaced = sReplaceable.sReplace (9, L'9');
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }


    sReplaceable = L"01234abc89";
    sReplaced = sReplaceable.sReplace (5, 3, L"567");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234aaa6789";
    sReplaced = sReplaceable.sReplace (5, 3, L"5");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234567ab";
    sErased = sReplaceable.sReplace (8, 2, L"89");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234567a9";
    sReplaced = sReplaceable.sReplace (8, 2, L"8");
    if (sReplaced != sReplaceable || sReplaceable != L"012345678")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234567ab";
    sReplaced = sReplaceable.sReplace (8, 2, L"890");
    if (sReplaced != sReplaceable || sReplaceable != L"01234567890")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0ё2345ё78ёё";
    sReplaceable.Replace (0, 10, L'ё', L'е');
    if (sReplaceable != L"0е2345е78ее")
    {
        ERROR_LOG (L"Replace error");
    }


    CEString sTrimmable (L"     01234     ");
    sTrimmable.TrimLeft();
    if (sTrimmable != L"01234     ")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable.TrimRight();
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"     01234     ";
    sTrimmable.Trim();
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"=&=&=01234&&&==";
    sTrimmable.TrimLeft (L"=&");
    if (sTrimmable != L"01234&&&==")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable.Trim (L"=&");
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"=&=&=01234&&&==";
    sTrimmable.Trim (L"=&");
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    CEString sReversable (L"0123456789");
    sReversable.Reverse();
    if (sReversable != L"9876543210")
    {
        ERROR_LOG (L"Reversing error");
    }

    CEString sWhole (L"0123456789");
    CEString sSubstr = sWhole.sSubstr (1, 3);
    if (sSubstr != L"123")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sSubstr = sWhole.sSubstr (7);
    if (sSubstr != L"789")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    CEString sFields (L"123 456 789");
    sFields.SetBreakChars (L" ");
    CEString sField = sFields.sGetField (1);
    if (sField != L"456")
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    StToken stToken = sFields.stGetField (0);
    if (3 != stToken.uiLength || 0 != stToken.uiOffset || ecTokenText != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    stToken = sFields.stGetField (0, ecTokenSpace);
    if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    try
    {
//        stToken = sFields.stGetField (99);
//        ERROR_LOG (L"Tokenizer or comparison error");   // Exception expected
    }
    catch (CException& ex)
    {
//        ::MessageBox (NULL, ex.sGetDescription().c_str(), L"Kai Exception", MB_ICONWARNING);
    }

//    ST_Token st_GetFieldFromOffset (int i_offset,
//                                    et_TokenType eo_type = ec_TokenText);

    stToken = sFields.stGetTokenFromOffset (6);
    if (3 != stToken.uiLength || 4 != stToken.uiOffset || ecTokenText != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    ETokenType eType = sFields.eGetTokenType (1);
//    et_TokenType eo_GetTokenType (int i_offset, int i_at);
    if (ecTokenBreakChars != eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    stToken = sFields.stGetToken (1);
    if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    const StToken& rstToken = sFields.rstGetToken (1);
    if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    CEString sToken = sFields.sGetToken (1);
    if (sToken != L" ")
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    try
    {
        CEString sToken1 = sFields.sGetToken(999);
        if (sToken1 != L" ")
        {
            ERROR_LOG(L"Tokenizer or comparison error");
        }
    }
    catch (CException ex)
    {
        CEString sMsg(L"Exception: ");
        sMsg += ex.szGetDescription();
        ERROR_LOG(sMsg);
    }

    bool b_ = sFields.bGetNextToken(stToken);
    if (!b_ || ecTokenText != stToken.eType || 4 != stToken.uiOffset || 3 != stToken.uiLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    b_ = sFields.bGetPrevToken (stToken);
    if (!b_ || ecTokenBreakChars != stToken.eType || 3 != stToken.uiOffset || 1 != stToken.uiLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiTokenNum = sFields.uiGetTokenNum (stToken);
    if (1 != uiTokenNum)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiFields = sFields.uiGetNumOfFields();
    if (3 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    uiFields = sFields.uiGetNumOfFields (ecTokenSpace);
    if (2 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    uiFields = sFields.uiGetNumOfFields (3, 6);

    uiFields = sFields.uiNFields();
    if (3 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    uiFields = sFields.uiNFields (3, 6); 

    unsigned int uiTokens = sFields.uiGetNumOfTokens();
    if (5 != uiTokens)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    uiTokens = sFields.uiNTokens();
    if (5 != uiTokens)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiVLength = sFields.uiGetVisibleLength();
    if (11 != uiVLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiFLength = sFields.uiGetFieldLength (1);
    if (3 != uiFLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    CEString s (L"abcdefg");
//    wchar_t * szData = (wchar_t*)s;

    CEString sSyllables (L"бавогузюы");
    sSyllables.SetVowels (L"аеёиоуыэюя");
    unsigned int uiSyllables = sSyllables.uiGetNumOfSyllables();
    if (5 != uiSyllables)
    {
        ERROR_LOG (L"Syllable count error");
    }

    uiSyllables = sSyllables.uiNSyllables();
    if (5 != uiSyllables)
    {
        ERROR_LOG (L"Syllable count error");
    }


    // Vowels & consonants
    unsigned int uiVowelPos = sSyllables.uiGetVowelPos (3);
    if (7 != uiVowelPos)
    {
        ERROR_LOG (L"Vowel position error");
    }
    
    unsigned int uiSyllPos = sSyllables.uiGetSyllableFromVowelPos (7);
    if (3 != uiSyllPos)
    {
        ERROR_LOG (L"Syllable position error");
    }

    {
        CEString sConvert = CEString::sToString (9999999999999);
        if (L"9999999999999" != sConvert)
        {
            ERROR_LOG(L"Large int conversion error");
        }
        int i_ = 999999;
        sConvert = CEString::sToString (i_);
        if (L"999999" != sConvert)
        {
            ERROR_LOG(L"Int conversion error");
        }
    }

    {
        CEString sConvert = CEString::sToString(999999999.9999);
        double d_ = 999999.999;
        sConvert = CEString::sToString(d_);
    }

    //
    // Done!
    //
    CLogger::pGetInstance()->Flush();

_CrtDumpMemoryLeaks();


}
Exemplo n.º 6
0
int CT_LexPreprocessor::iClassifyStems()
// For every endings subtable, looks for the stems usable with it and
// stores their longest common suffixes in the database.
//
// The limit passed to iLCP starts at NUM_SFX and is raised in steps of 2
// (never beyond MAX_NUM_SFX) for as long as iLCP yields no usable suffix.
// Subtables with fewer than MIN_NUMBER_OF_STEMS stems are skipped.
//
// Returns -1 if no database is attached, 0 otherwise.
{
    if (m_pDb == NULL)
    {
        return -1;
    }
    const int MIN_NUMBER_OF_STEMS = 70;
    const int NUM_SFX = 5;
    const int MAX_NUM_SFX = 24;

    vector<CEString> vecStems;

    const int iLastSubtable = m_pDb->iLastID(L"endings_meta");
    for (int iSubtable = 0; iSubtable <= iLastSubtable; ++iSubtable)
    {
        vecStems.clear();
        CEString sFirstLemma = L"";

        vLongStemsBySubtable(iSubtable, 2, vecStems, sFirstLemma);
        if (vecStems.size() < MIN_NUMBER_OF_STEMS)
        {
            continue;
        }

        // Find the longest common prefix of the first stem and the corresponding lemma
        CEString* arr_sStemAndLemma = new CEString[2];
        arr_sStemAndLemma[0] = vecStems[0];
        arr_sStemAndLemma[1] = sFirstLemma;
        CEString** parr_sPfx = new CEString*;
        *parr_sPfx = new CEString[1];
        const int iPfx = iLCP(arr_sStemAndLemma, parr_sPfx, 2, 1);

        // Copy the result out and release the temporary buffers right away,
        // so that none of the early exits below can leak them. The buffers
        // are freed the same way as the suffix buffers further down.
        CEString sCommonPfx;
        if (iPfx > 0)
        {
            sCommonPfx = (*parr_sPfx)[0];
        }
        delete[] arr_sStemAndLemma;
        delete[] *parr_sPfx;
        delete parr_sPfx;

        if (iPfx <= 0)
        {
            continue;
        }

        // Number of characters at the end of the first stem that are not
        // shared with the lemma; subtables where this is 4 or more are skipped.
        const int iCutRight = vecStems[0].uiLength() - sCommonPfx.uiLength();
        if (iCutRight >= 4)
        {
            continue;
        }

        // The part of the lemma that follows the common prefix
        CEString sLemmaEnding = sFirstLemma.sSubstr(sCommonPfx.uiLength(),
                                                    sFirstLemma.uiLength() - sCommonPfx.uiLength());

        // Find longest common suffixes of the stems found
        CEString* arr_sStems = new CEString[vecStems.size()];
        CEString** parr_sSfx = new CEString*;
        *parr_sSfx = new CEString[1];
        int iStem = 0;
        for (vector<CEString>::iterator iterStems = vecStems.begin();
            iterStems != vecStems.end();
            ++iterStems, ++iStem)
        {
            // We reverse the stem (in place, inside vecStems) so that iLCP
            // could find suffixes instead of prefixes
            (*iterStems).Reverse();
            arr_sStems[iStem] = *iterStems;
        }

        // Several attempts: start with NUM_SFX and keep raising the limit
        // until something usable is found or MAX_NUM_SFX is exceeded.
        int iSfx = 0;
        int iMaxSfx = NUM_SFX;
        while (iSfx <= 0 && iMaxSfx <= MAX_NUM_SFX)
        {
            // Start every attempt with a fresh output buffer
            delete[] *parr_sSfx;
            delete parr_sSfx;
            parr_sSfx = new CEString*;
            *parr_sSfx = new CEString[1];
            iSfx = iLCP(arr_sStems, parr_sSfx, vecStems.size(), iMaxSfx);
            if (iSfx == 1 && (*parr_sSfx)[0].uiLength() <= 0)
            {
                // A single empty string counts as "nothing found"
                iSfx = 0;
            }
            iMaxSfx += 2;
        }
        vInsertCommonSfx(parr_sSfx, iSfx, iSubtable, vecStems.size(), iCutRight, sLemmaEnding);

        delete[] arr_sStems;
        delete[] *parr_sSfx;
        delete parr_sSfx;

        // TEST
        //if (i_subtable > 100)
        //{
        //    break;
        //}
    }
    return 0;
}