Ejemplo n.º 1
0
CEString CT_LexPreprocessor::sInsertStress(int iLetter, CEString s_)
// Returns a copy of s_ with the stress mark "<" inserted immediately before
// the character at index iLetter. When iLetter lies outside [0, length of s_),
// s_ is returned unchanged.
{
    const int iLength = (int)(s_.uiLength());
    if (iLetter < 0 || iLetter >= iLength)
    {
        return s_;
    }

    CEString sBeforeMark (s_.sSubstr(0, iLetter));
    CEString sFromMark (s_.sSubstr(iLetter, s_.uiLength() - iLetter));
    return sBeforeMark + L"<" + sFromMark;
}
Ejemplo n.º 2
0
bool CAnalyzer::bIsValidLemma(CEString sWf)
// Plausibility filter for a candidate lemma.
// Returns false when sWf contains no vowel letter at all, or when it matches
// any of the rejected letter patterns listed below; returns true otherwise.
// The rejected patterns are tried in the order given and evaluation stops at
// the first one that matches.
{
    // Required: at least one vowel letter, lower or upper case.
    const bool bHasVowel = sWf.bRegexSearch (L"[аеёиоуыэюяАЕЁИОУЫЭЮЯ]");
    if (!bHasVowel)
    {
        return false;
    }

    const bool bRejected =
        // a vowel, ъ or ь immediately followed by ь or ъ
        sWf.bRegexSearch (L"[аеёиоуыэюяъь][ьъ]") ||
        // ъ immediately followed by one of а о у ы э и
        sWf.bRegexSearch (L"ъ[аоуыэи]") ||
        // word-final "ть" preceded by two letters from the listed vowel sets
        sWf.bRegexSearch (L"[аоэуе][аоэуы]ть$") ||
        // word-final ь after к, г, х or ц
        sWf.bRegexSearch (L"[кгхц]ь$") ||
        // word-final "ый" after к, г or х
        sWf.bRegexSearch (L"[кгх]ый$") ||
        // ы anywhere after ж, ч, ш or щ
        sWf.bRegexSearch (L"[жчшщ]ы") ||
        // word-final ы
        sWf.bRegexSearch (L"ы$") ||
        // a consonant followed by й anywhere, or by a word-final ъ
        sWf.bRegexSearch (L"[бвгджзклмнпрстфхцчшщ](й|ъ$)") ||
        // four consonants in a row at the end of the word
        sWf.bRegexSearch (L"[бвгджзклмнпрстфхцчшщ]{4}$");

    return !bRejected;
}
Ejemplo n.º 3
0
int CT_LexPreprocessor::iDeleteStress(CEString& s_)
// Removes the "<" stress mark located by uiFind() from s_ (in place) and
// returns the index the mark occupied. If the reported position is not inside
// the string, s_ is left untouched and -1 is returned.
{
    const unsigned int uiMarkAt = s_.uiFind(L"<");
    const unsigned int uiTotal = s_.uiLength();
    if (uiMarkAt >= uiTotal)
    {
        return -1;
    }

    // Rebuild the string from the text on either side of the mark.
    CEString sHead (s_.sSubstr(0, uiMarkAt));
    CEString sTail (s_.sSubstr(uiMarkAt + 1, uiTotal - uiMarkAt - 1));
    s_ = sHead + sTail;

    return (int)uiMarkAt;
}
Ejemplo n.º 4
0
ET_ReturnCode CLexeme::eGetAlternatingPreverb (const CEString& sVerbForm, CEString& sPreverb, bool& bVoicing)
// Looks for an alternating preverb at the start of sVerbForm.
//   sPreverb  [out] the first entry of m_vecAlternatingPreverbs that sVerbForm
//                   starts with or, failing that, the first such entry of
//                   m_vecAlternatingPreverbsWithVoicing.
//   bVoicing  [out] set to true when the match came from the "with voicing"
//                   list. NOTE(review): it is not written on any other path,
//                   so callers presumably initialize it to false -- confirm.
// Returns H_NO_MORE if the lexeme has no fleeting vowel, H_FALSE if no preverb
// matches, H_ERROR_INVALID_ARG if fewer than two characters follow the
// preverb, and H_NO_ERROR otherwise.
{
    if (!m_stProperties.bFleetingVowel)
    {
        return H_NO_MORE;
    }

    // types 5, 6, 7, 8, 9, 11, 14
    bool bMatched = false;

    vector<CEString>::iterator itCandidate = m_vecAlternatingPreverbs.begin();
    while (itCandidate != m_vecAlternatingPreverbs.end())
    {
        if (sVerbForm.bStartsWith (*itCandidate))
        {
            sPreverb = *itCandidate;
            bMatched = true;
            break;
        }
        ++itCandidate;
    }

    if (!bMatched)
    {
        itCandidate = m_vecAlternatingPreverbsWithVoicing.begin();
        while (itCandidate != m_vecAlternatingPreverbsWithVoicing.end())
        {
            if (sVerbForm.bStartsWith (*itCandidate))
            {
                sPreverb = *itCandidate;
                bMatched = true;
                bVoicing = true;
                break;
            }
            ++itCandidate;
        }
    }

    if (!bMatched)
    {
        return H_FALSE;
    }

    // At least two characters must remain after the preverb.
    if (sVerbForm.uiLength() < sPreverb.uiLength() + 2)
    {
        ASSERT(0);
        ERROR_LOG (L"Stem too short.");
        return H_ERROR_INVALID_ARG;
    }

    return H_NO_ERROR;

}       //  eGetAlternatingPreverb(...)
ET_ReturnCode CFormBuilderPronounAdj::eGetStressPositions (const CEString& sEnding,
                                                           ET_StressLocation eStressType,
                                                           vector<int>& vecStressPos)
// Appends the stress position(s) for the form m_sLemma + sEnding to vecStressPos.
//   STRESS_LOCATION_STEM:   delegated to eGetStemStressPositions(); its return
//                           code is passed back to the caller.
//   STRESS_LOCATION_ENDING: exactly one position is appended, derived from the
//                           syllable count of m_sLemma (see below).
//   any other value:        H_ERROR_INVALID_ARG.
{
    m_sLemma.SetVowels (g_szRusVowels);

    if (STRESS_LOCATION_STEM == eStressType)
    {
        return eGetStemStressPositions (m_sLemma, m_eSubparadigm, vecStressPos);
    }

    if (STRESS_LOCATION_ENDING != eStressType)
    {
        ASSERT(0);
        ERROR_LOG (L"Illegal stress type.");
        return H_ERROR_INVALID_ARG;
    }

    int iEndingStress = -1;
    if (sEnding.uiNSyllables() < 1)
    {
        // The ending has no syllable of its own: use the last syllable of the lemma.
        iEndingStress = m_sLemma.uiNSyllables() - 1;
    }
    else if (L"мс-п" == m_pLexeme->sInflectionType() &&
             (L"его" == sEnding || L"ему" == sEnding ||
              L"ого" == sEnding || L"ому" == sEnding))
    {
        // Second syllable of the ending: одног<о, твоем<у
        iEndingStress = m_sLemma.uiNSyllables() + 1;
    }
    else
    {
        // First syllable of the ending.
        iEndingStress = m_sLemma.uiNSyllables();
    }

    vecStressPos.push_back (iEndingStress);

    return H_NO_ERROR;

}   //  eGetStressPositions (...)
Ejemplo n.º 6
0
int CAnalyzer::iCheckEndings(vector<CHasher>& vecPossibleWordforms,
                             vector<stStemLinks>& vecStems,
                             CEString sLeft,
                             CEString sRight,
                             int iStressPosEnding)
// If vecStems IS NOT empty:
// For every stem in vecStems, take the corresponding endings table
// and look whether it contains an ending equal to sRight;
// for every such ending, add a wordform to vecPossibleWordforms.
//
// If vecStems IS empty:
// Look for an ending equal to sRight; for every such ending,
// build a wordform and store it in vecPossibleWordforms.
// (Identical wordforms are stored as one wordform.)
//
// sLeft is only used in the second (guessing) mode, where the lemma is derived
// from it; iStressPosEnding is only used in the first mode.
//
// Returns -1 if m_pDb is NULL, 0 otherwise. Results are appended to
// vecPossibleWordforms; existing entries are kept.
{
    if (m_pDb == NULL)
    {
        return -1;
    }

    // NOTE(review): this function-local static scratch buffer makes the routine
    // non-reentrant and unsafe to call from several threads at once; it is kept
    // as in the original and is always left empty on exit.
    static vector<int> vecGram;

    for (vector<stStemLinks>::iterator itStems = vecStems.begin();
         itStems != vecStems.end(); ++itStems)
    {
        // Look up the endings table linked to this stem and collect the hashes of
        // all endings in it that are equal to sRight. Each hash becomes one wordform
        // carrying the stem's lexeme id and lemma.
        vecGram = arr_freq_endings[(*itStems).iEndingsLink].m_vecFind (sRight, iStressPosEnding);
        for (vector<int>::iterator iter_endings = vecGram.begin();
             iter_endings != vecGram.end(); ++iter_endings)
        {
            CHasher tmpWf;
            tmpWf.hDecodeHash(*iter_endings);
            tmpWf.m_llLexemeId = (*itStems).llLexemeId;
            tmpWf.m_sLemma = (*itStems).sLemma;
            vecPossibleWordforms.push_back (tmpWf);
        }
    }

    if (vecStems.empty())
    // Try to guess the lexeme
    {
        // Stems of one or two characters are not guessed.
        if (sLeft.uiLength() <= 2)
        {
            return 0;
        }

        // Collect every endings subtable that lists sRight as an ending.
        vector<int> vec_i_possible_ETs;
        pair<multimap<CEString, int>::iterator,
             multimap<CEString, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range(sRight);
        for (; pair_search_result.first != pair_search_result.second; ++pair_search_result.first)
        {
            vec_i_possible_ETs.push_back(pair_search_result.first->second);
        }

        for (vector<int>::iterator iter_ET = vec_i_possible_ETs.begin();
             iter_ET != vec_i_possible_ETs.end();
             ++iter_ET)
        {
            // If the subtable prescribes a stem-final pattern, sLeft must end in it.
            if (arr_freq_endings[*iter_ET].m_sStemFinal.uiLength() > 0 &&
                !sLeft.bRegexMatch (L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinal + L")$"))
            {
                continue;
            }
            // Something must remain of sLeft after m_iCutRight characters are cut off.
            if (sLeft.uiLength() <= arr_freq_endings[*iter_ET].m_iCutRight)
            {
                continue;
            }

            // -2 is passed as the stress position here (iAnalyze uses the same value
            // when the input carries no stress mark).
            vecGram = arr_freq_endings[*iter_ET].m_vecFind(sRight, -2);
            for (vector<int>::iterator itHash = vecGram.begin();
                 itHash != vecGram.end(); ++itHash)
            {
                CHasher tmpWf;
                tmpWf.m_sLemma = sLeft.sSubstr(0, sLeft.uiLength() - arr_freq_endings[*iter_ET].m_iCutRight) + arr_freq_endings[*iter_ET].m_sLemmaFinal;
                if (!bIsValidLemma (tmpWf.m_sLemma))
                {
                    continue;
                }

                // Check if what we've found is a new wordform; one match is enough.
                bool bExists = false;
                for (vector<CHasher>::iterator itWf = vecPossibleWordforms.begin();
                     itWf != vecPossibleWordforms.end();
                     ++itWf)
                {
                    if ((*itWf).m_sLemma == tmpWf.m_sLemma && (*itWf).iGramHash() == *itHash)
                    {
                        bExists = true;
                        break;
                    }
                }
                if (!bExists)
                {
                    tmpWf.hDecodeHash(*itHash);
                    tmpWf.m_llLexemeId = 0;     // guessed: no lexeme in the database
                    vecPossibleWordforms.push_back(tmpWf);
                }
            }
        }
    }

    vecGram.clear();
    return 0;
}
Ejemplo n.º 7
0
int CAnalyzer::iAnalyze(CEString sWordform,
                        vector<CHasher>& vecPossibleWordforms,
                        BOOL bGuess)
// Analyzes sWordform and stores the candidate parses in vecPossibleWordforms
// (which is cleared first).
//
// The word may carry a stress mark: either "<" placed before the stressed
// letter (кор<ова) or U+0301 placed after it (коро\u0301ва). The mark is removed
// from the local copy of the word and its position is used to constrain the
// stem and ending look-ups. If several marks are present, the first one
// determines the stress position and the others are simply removed.
//
// Three passes are made, each one only if the previous ones found nothing:
//   1. every split of the word into stem + ending is looked up in the database;
//   2. the prefixes "пол-", "пол" and "не" are cut off and the remainder is
//      analyzed recursively, keeping only the parses compatible with the prefix;
//   3. if bGuess is TRUE, the lexeme is guessed from the ending alone.
//
// Returns -1 if the word is empty or m_pDb is NULL, otherwise the number of
// elements in vecPossibleWordforms.
{
    if (sWordform.uiLength() <= 0 || m_pDb == NULL)
    {
        return -1;
    }

    // The prefix pass below works on the word as it was passed in: the recursive
    // call extracts the stress mark from the remainder by itself.
    CEString sWordformOriginal(sWordform);

    int iStressPosStem = -1, iStressPosEnding = -1;

    // Strip the stress mark(s) and remember where the first one pointed.
    // iStressPos is the index of the stressed letter in the mark-free word,
    // or -1 if the word carries no usable mark.
    int iStressPos = -1;
    bool bStressSeen = false;
    for (;;)
    {
        int iMarkPos = sWordform.uiFindOneOf(0, L"<\u0301");
        if (ecNotFound == iMarkPos)
        {
            break;
        }
        const bool bAngleMark = (sWordform[iMarkPos] == L'<');
        sWordform.sErase(iMarkPos, 1);
        if (!bStressSeen)
        {
            // "<" precedes the stressed letter, which slides into the mark's slot
            // once the mark is erased; U+0301 follows the stressed letter.
            iStressPos = bAngleMark ? iMarkPos : iMarkPos - 1;
            bStressSeen = true;
        }
    }

    CEString sLeft, sRight;
    vector<stStemLinks> vecStems;
    vecPossibleWordforms.clear();

    // Pass 1: try every stem/ending split, longest stem first.
    for (int iLeft = sWordform.uiLength(); iLeft >= 0; --iLeft)
    {
        sLeft = sWordform.sSubstr(0, iLeft);
        sRight = sWordform.sSubstr(iLeft, sWordform.uiLength() - iLeft);

        // Stress positions for the stem and the ending:
        // -2 = stress unknown, -1 = stress falls on the other part.
        if (iStressPos == -1)
        {
            iStressPosStem = iStressPosEnding = -2;
        }
        else if (iStressPos >= sLeft.uiLength())
        {
            iStressPosStem = -1;
            iStressPosEnding = iStressPos - sLeft.uiLength();
        }
        else
        {
            iStressPosStem = iStressPos;
            iStressPosEnding = -1;
        }

        vecStems.clear();
        iLookUpStems(vecStems, sLeft, iStressPosStem);
        if (vecStems.empty())
        {
            continue;
        }
        iCheckEndings(vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
    }

    // Pass 2: if we have no result, try cutting off possible prefixes
    if (vecPossibleWordforms.empty())
    {
        for (int iLeft = min(sWordformOriginal.uiLength(), 4); iLeft >= 1; --iLeft)
        {
            sLeft = sWordformOriginal.sSubstr(0, iLeft);
            sRight = sWordformOriginal.sSubstr(iLeft, sWordformOriginal.uiLength() - iLeft);

            const bool bHalfPrefix = (iLeft == 4 && sLeft == L"пол-") ||
                                     (iLeft == 3 && sLeft == L"пол");
            const bool bNegPrefix = (iLeft == 2 && sLeft == L"не");
            if (!bHalfPrefix && !bNegPrefix)
            {
                continue;
            }

            int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess);
            if (iResult <= 0)
            {
                continue;
            }

            // Walk backwards so that erasing does not disturb the indices still to visit.
            for (int iWf = iResult - 1; iWf >= 0; --iWf)
            {
                const CHasher& wf = vecPossibleWordforms[iWf];
                bool bDiscard = false;
                if (bHalfPrefix)
                {
                    // After "пол(-)" only a genitive singular noun is acceptable.
                    bDiscard = wf.m_ePos != POS_NOUN ||
                               wf.m_eNumber != NUM_SG ||
                               wf.m_eCase != CASE_GEN;
                }
                else
                {
                    // After "не" only nouns, adjectives and verb forms other than the
                    // present tense, past tense and infinitive are acceptable.
                    // (The original test OR-ed three "!=" comparisons on m_ePos, which
                    // is always true and therefore discarded every candidate.)
                    const bool bAllowedPos = wf.m_ePos == POS_NOUN ||
                                             wf.m_ePos == POS_ADJ ||
                                             wf.m_ePos == POS_VERB;
                    const bool bExcludedVerbForm =
                        wf.m_ePos == POS_VERB &&
                        (wf.m_eSubparadigm == SUBPARADIGM_PRESENT_TENSE ||
                         wf.m_eSubparadigm == SUBPARADIGM_PAST_TENSE ||
                         wf.m_eSubparadigm == SUBPARADIGM_INFINITIVE);
                    bDiscard = !bAllowedPos || bExcludedVerbForm;
                }
                if (bDiscard)
                {
                    vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf);
                }
            }
            if (vecPossibleWordforms.size() > 0)
            {
                return (int)vecPossibleWordforms.size();
            }
        }
    }

    // Pass 3: now, if we haven't found anything, we may guess the lexeme
    if (vecPossibleWordforms.empty() && bGuess == TRUE)
    {
        for (int iLeft = 0; iLeft <= sWordform.uiLength(); ++iLeft)
        {
            sLeft = sWordform.sSubstr (0, iLeft);
            sRight = sWordform.sSubstr (iLeft, sWordform.uiLength() - iLeft);

            // Stress positions for the stem and the ending (same encoding as in pass 1)
            if (iStressPos == -1)
            {
                iStressPosStem = iStressPosEnding = -2;
            }
            else if (iStressPos >= sLeft.uiLength())
            {
                iStressPosStem = -1;
                iStressPosEnding = iStressPos - sLeft.uiLength();
            }
            else
            {
                iStressPosStem = iStressPos;
                iStressPosEnding = -1;
            }

            // An empty stem list switches iCheckEndings into guessing mode.
            vecStems.clear();
            iCheckEndings (vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
            if ((bContainsPlausibleVariants (vecPossibleWordforms) && sRight.uiLength() <= 3) ||
                vecPossibleWordforms.size() >= 4)
            {
                break;
            }
        }
        if (vecPossibleWordforms.size() > 4)
        {
            LeaveMostPlausible (vecPossibleWordforms);
        }
    }

    return (int)vecPossibleWordforms.size();
}
Ejemplo n.º 8
0
ET_ReturnCode CFormBuilderNouns::eBuild()
// Generates the noun forms of m_pLexeme and hands them over via AddWordForm().
//
// A CHasher initialized with the lexeme's gender and animacy is stepped through
// its gram combinations with bIncrement(). For each number/case combination
// that applies to the lexeme, the stem is adjusted (stem augment, fleeting
// vowel, yo alternation), the matching endings are selected and one word form
// per ending and stress variant is created.
//
// Returns H_NO_ERROR on completion, or the first error code reported by
// eHandleAccEnding, eCheckIrregularForms, eHandleStemAugment, eGetStressType
// or eGetEnding. Errors reported by the per-ending helpers further down only
// cause that ending (or stress variant) to be skipped.
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    ET_ReturnCode rc = H_NO_ERROR;

    // NOTE(review): a standard throwing operator new never returns NULL, so the
    // check below would only matter with a non-throwing allocator -- confirm.
    // Any object m_pEndings pointed to before is not released here.
    m_pEndings = new CNounEndings(m_pLexeme);
    if (NULL == m_pEndings)
    {
        return H_ERROR_POINTER;
    }

    // rc has not been assigned since its initialization above, so this test
    // cannot fire as the code stands.
    if (rc != H_NO_ERROR)
    {
        return rc;
    }

    ET_Animacy eAnimacy = m_pLexeme->eAnimacy();
    ET_Gender eoGender = m_pLexeme->eGender();

    CHasher gramIterator;
    gramIterator.Initialize(eoGender, eAnimacy);
    do
    {
        // Every "continue" directly inside this do-while jumps to the loop
        // condition, i.e. gramIterator.bIncrement() is still executed.

        // Main symbol "мн.": no singular forms are built.
        if ((L"мн." == m_pLexeme->sMainSymbol()) && (gramIterator.m_eNumber == NUM_SG))
        {
            continue;
        }

        // CASE_PART and CASE_LOC are only built in the singular ...
        if (NUM_PL == gramIterator.m_eNumber && 
            (CASE_PART == gramIterator.m_eCase || CASE_LOC == gramIterator.m_eCase))
        {
            continue;
        }

        // ... and only for lexemes flagged as having a second genitive / second locative.
        if (CASE_PART == gramIterator.m_eCase && !m_pLexeme->bSecondGenitive())
        {
            continue;
        }

        if (CASE_LOC == gramIterator.m_eCase && !m_pLexeme->bSecondLocative())
        {
            continue;
        }

        // eEndingCase is the case whose endings are used; it may differ from the
        // case of the form being built.
        // Accusative: let eHandleAccEnding decide which case's endings apply.
        ET_Case eEndingCase = gramIterator.m_eCase;
        if (CASE_ACC == gramIterator.m_eCase)
        {
            rc = eHandleAccEnding (gramIterator.m_eNumber, eEndingCase);
            if (rc != H_NO_ERROR)
            {
                return rc;
            }
        }
        // CASE_PART and CASE_LOC forms take the dative endings.
        if (CASE_PART == gramIterator.m_eCase || CASE_LOC == gramIterator.m_eCase)
        {
            eEndingCase = CASE_DAT;
        }

        CEString sLemma (m_pLexeme->sGraphicStem());
        if (m_pLexeme->bHasIrregularForms())
        {
            bool bSkipRegular = false;
            rc = eCheckIrregularForms (gramIterator.m_eGender, 
                                       gramIterator.m_eAnimacy,
                                       gramIterator.m_eCase,
                                       eEndingCase,
                                       gramIterator.m_eNumber,
                                       bSkipRegular);
            if (rc != H_NO_ERROR)
            {
                return rc;
            }
            if (bSkipRegular)
            {
                // Workaround for lack of the "исх. форма иррег." ("irregular source form")
                // mark in current source
                if (GENDER_M == gramIterator.m_eGender && NUM_SG == gramIterator.m_eNumber && CASE_NOM == gramIterator.m_eCase)
                {
                    m_bIrregularSourceForm = true;
                }
                continue;
            }
        }

        rc = eHandleStemAugment (sLemma, gramIterator.m_eNumber, gramIterator.m_eCase);
        if (rc != H_NO_ERROR)
        {
            return rc;
        }

        // CASE_LOC forms are always ending-stressed; for all other cases the
        // stress type is obtained from eGetStressType.
        ET_StressLocation eStress = STRESS_LOCATION_UNDEFINED;
        if (CASE_LOC == gramIterator.m_eCase)
        {
            eStress = STRESS_LOCATION_ENDING;
        }
        else
        {
            rc = eGetStressType (gramIterator.m_eNumber, eEndingCase, eStress);
            if (rc != H_NO_ERROR)
            {
                return rc;
            }
        }

        // The return code of eSelect is not checked; an unsuccessful selection
        // is only noticed through the ending count below.
        ((CNounEndings *)m_pEndings)->eSelect(gramIterator.m_eNumber, eEndingCase, eStress);
        int iNumEndings = m_pEndings->iCount();
        if (iNumEndings < 1)
        {
            // Having no endings is reported as an error unless the lexeme's type is 0.
            if (m_pLexeme->iType() != 0)
            {
                ASSERT(0);
                ERROR_LOG(L"No endings");
            }
            continue;
        }

        // sLemma is restored from sSavedLemma after every ending (see the loop's
        // increment expression), including iterations left via "continue".
        CEString sSavedLemma (sLemma);      // lemma can change, e.g. because of a fleeting vowel
        for (int iEnding = 0; iEnding < iNumEndings; ++iEnding, sLemma = sSavedLemma)
        {
            // Get ending and modify as necessary
            CEString sEnding;
            unsigned __int64 llEndingKey = -1;
            rc = m_pEndings->eGetEnding(iEnding, sEnding, llEndingKey);
            if (rc != H_NO_ERROR)
            {
                return rc;
            }

            // Type 8, non-neuter: after a stem ending in ш, ж, ч, щ or ц the endings
            // starting with "я" are skipped; after any other stem those starting
            // with "а" are skipped.
            if (8 == m_pLexeme->iType() && GENDER_N != m_pLexeme->eGender())
            {
                if (sLemma.bEndsWithOneOf (L"шжчщц"))
                {
                    if (sEnding.bStartsWith (L"я"))
                    {
                        continue;
                    }
                }
                else
                {
                    if (sEnding.bStartsWith (L"а"))
                    {
                        continue;
                    }
                }
            }

            // From here on a failing helper skips the current ending instead of
            // aborting the build; its error code is not passed on to the caller.
            // NOTE(review): bHasFleetingVowel is never read or written after this
            // declaration.
            bool bHasFleetingVowel = false;
            rc = eFleetingVowelCheck (gramIterator.m_eNumber, 
                                      eEndingCase,
                                      gramIterator.m_eGender, 
                                      eStress,
                                      SUBPARADIGM_NOUN,
                                      sEnding,
                                      sLemma);
            if (rc != H_NO_ERROR)
            {
                continue;
            }

            vector<int> vecStress;
            rc = eGetStressPositions (sLemma, sEnding, eStress, vecStress);
            if (rc != H_NO_ERROR)
            {
                continue;
            }

            CWordForm * pWordForm = NULL;
            rc = eCreateFormTemplate (gramIterator.m_eNumber, gramIterator.m_eCase, sLemma, pWordForm);
            if (rc != H_NO_ERROR)
            {
                continue;
            }

            if (1 == vecStress.size() || m_pLexeme->bIsMultistressedCompound())
            {
                // A single stress position, or a multistressed compound: all
                // positions are recorded in one and the same word form.
                vector<int>::iterator itStressPos = vecStress.begin();
                for (; itStressPos != vecStress.end(); ++itStressPos)
                {
                    pWordForm->m_mapStress[*itStressPos] = STRESS_PRIMARY;  // primary
                    rc = eHandleYoAlternation (eStress, *itStressPos, pWordForm->m_sLemma, sEnding);
                    if (rc != H_NO_ERROR)
                    {
                        // NOTE(review): this skips the assignments below, but the
                        // form is still passed to AddWordForm after the loop --
                        // confirm that this is intended.
                        continue;
                    }
                    pWordForm->m_sEnding = sEnding;
                    pWordForm->m_llEndingDataId = llEndingKey;
                    pWordForm->m_sWordForm = pWordForm->m_sLemma + sEnding;
                }
                m_pLexeme->AddWordForm (pWordForm);                        
            }
            else
            {
                // Several alternative stress positions: one word form per position.
                // The first uses the template; each later one is a clone of the
                // form handled in the previous iteration with its stress map cleared.
                vector<int>::iterator itStressPos = vecStress.begin();
                for (; itStressPos != vecStress.end(); ++itStressPos)
                {
                    if (itStressPos != vecStress.begin())
                    {
                        CWordForm * pwfVariant = NULL;
                        CloneWordForm (pWordForm, pwfVariant);
                        pwfVariant->m_mapStress.clear();
                        pWordForm = pwfVariant;
                    }
                    pWordForm->m_mapStress[*itStressPos] = STRESS_PRIMARY;  // primary
                    rc = eHandleYoAlternation (eStress, *itStressPos, pWordForm->m_sLemma, sEnding);
                    if (rc != H_NO_ERROR)
                    {
                        // NOTE(review): the current form is not passed to AddWordForm
                        // on this path; whether that leaks it depends on who owns
                        // CWordForm objects -- confirm.
                        continue;
                    }
                    pWordForm->m_sWordForm = pWordForm->m_sLemma + sEnding;
                    pWordForm->m_sEnding = sEnding;
                    pWordForm->m_llEndingDataId = llEndingKey;

                    m_pLexeme->AddWordForm (pWordForm);
                }
            }
        }   //  for (int iEnding = 0; ... )
    
    } while (gramIterator.bIncrement());

    return H_NO_ERROR;

}    //  eBuild()
Ejemplo n.º 9
0
ET_ReturnCode CFormBuilderNouns::eHandleStemAugment (CEString& sLemma, ET_Number eNumber, ET_Case eCase)
// Adjusts sLemma in place for lexemes that have a stem augment
// (m_pLexeme->iStemAugment() >= 1); other lexemes are left untouched.
//   type 1: the last two characters are removed (римлянин, южанин, армянин).
//   type 3: singular -- the second-to-last character is removed, except in the
//           nominative and in the accusative of inanimate nouns;
//           plural -- a stem in -онок/-ёнок/-оночек/-ёночек has that suffix
//           replaced by -ат/-ят/-атк/-ятк (-аток/-яток in the genitive and in
//           the accusative of animate nouns).
//   type 8: "ен" is appended, except in the nominative singular and in the
//           accusative singular of inanimate nouns.
// Always returns H_NO_ERROR.
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    if (m_pLexeme->iStemAugment() < 1)
    {
        return H_NO_ERROR;
    }

    switch (m_pLexeme->iType())
    {
        case 1:
        {
            sLemma.sErase (sLemma.uiLength()-2, 2);  // римлянин, южанин, армянин
            break;
        }
        case 3:
        {
            CEString& sGs = m_pLexeme->sGraphicStem();
            if (NUM_SG == eNumber)
            {
                const bool bKeepStem = (CASE_NOM == eCase) ||
                                       (ANIM_NO == m_pLexeme->eAnimacy() && CASE_ACC == eCase);
                if (!bKeepStem)
                {
                    sLemma.sErase (sLemma.uiLength()-2, 1);
                }
                break;
            }
            if (NUM_PL != eNumber)
            {
                break;
            }

            // The suffix is detected on the graphic stem but replaced in sLemma.
            if (sGs.bEndsWith (L"онок"))
            {
                sLemma.sErase (sLemma.uiLength()-4, 4);
                sLemma += L"ат";
            }
            else if (sGs.bEndsWith (L"ёнок"))
            {
                sLemma.sErase (sLemma.uiLength()-4, 4);
                sLemma += L"ят";
            }
            else if (sGs.bEndsWith (L"оночек"))
            {
                sLemma.sErase (sLemma.uiLength()-6, 6);
                // they all should be animate?
                const bool bWithVowel = (CASE_GEN == eCase) ||
                                        (CASE_ACC == eCase && ANIM_YES == m_pLexeme->eAnimacy());
                sLemma += bWithVowel ? L"аток" : L"атк";
            }
            else if (sGs.bEndsWith (L"ёночек"))
            {
                sLemma.sErase (sLemma.uiLength()-6, 6);
                // they all should be animate?
                const bool bWithVowel = (CASE_GEN == eCase) ||
                                        (CASE_ACC == eCase && ANIM_YES == m_pLexeme->eAnimacy());
                sLemma += bWithVowel ? L"яток" : L"ятк";
            }
            break;
        }
        case 8:
        {
            const bool bUnaugmented =
                NUM_SG == eNumber &&
                ((CASE_NOM == eCase) ||
                 (ANIM_NO == m_pLexeme->eAnimacy() && CASE_ACC == eCase));
            if (!bUnaugmented)
            {
                sLemma += L"ен";
            }
            break;
        }
        default:
        {
            break;
        }
    }

    return H_NO_ERROR;

}   //  eHandleStemAugment (...)
Ejemplo n.º 10
0
ET_ReturnCode CFormBuilderShortAdj::eGetStressTypes (ET_Number eNumber, 
                                                     ET_Gender eGender, 
                                                     vector<ET_StressLocation>& vecStressType)
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    ET_ReturnCode rc = H_NO_ERROR;

    if (NUM_PL == eNumber && GENDER_UNDEFINED != eGender)
    {
        ASSERT(0);
        ERROR_LOG (L"Unexpected gender/number values.");
        return H_ERROR_INVALID_ARG;
    }

    if (NUM_SG == eNumber && GENDER_UNDEFINED == eGender)
    {
        ASSERT(0);
        ERROR_LOG (L"Unexpected gender/number values.");
        return H_ERROR_INVALID_ARG;
    }

//    if (GENDER_M == eGender)
//    {
//        ASSERT (NUM_SG == eNumber);
//        vecStressType.push_back (STRESS_LOCATION_STEM);
//        return H_NO_ERROR;
//    }

    //
    // Exception: part past passive short ending in stressed -Annyj/-jAnnyj;
    // see GDRL p. 86 footnote 4
    //
    if (SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm)
    {
        CGramHasher hasher (POS_VERB, SUBPARADIGM_PART_PAST_PASS_LONG, CASE_NOM, NUM_SG, 
                            GENDER_M, PERSON_UNDEFINED, ANIM_NO, m_pLexeme->eAspect(), 
                            m_pLexeme->eIsReflexive());
        CWordForm * pNSgMLong = NULL;
        rc = m_pLexeme->eWordFormFromHash (hasher.iGramHash(), 0, pNSgMLong);
        if (rc != H_NO_ERROR)
        {
            return rc;
        }
        if (NULL == pNSgMLong)
        {
            ASSERT(0);
            ERROR_LOG (L"Failed to obtain N Sg m of the long form.");
            return H_ERROR_POINTER;
        }
        
        CEString sNSgMLong (pNSgMLong->m_sWordForm);
        if (sNSgMLong.bEndsWith (L"анный") || sNSgMLong.bEndsWith (L"янный") ||
            sNSgMLong.bEndsWith (L"енный"))
        {
            map<int, ET_StressType>::iterator itStress = pNSgMLong->m_mapStress.begin();
            for (; itStress != pNSgMLong->m_mapStress.end(); ++itStress)
            {
                if (sNSgMLong.uiNSyllables()-2 == (*itStress).first 
                    && STRESS_PRIMARY == (*itStress).second)
                {
// 1. -at'/-jat' [7] -- 1a, p. 83
// 2. monosyll verbs -- same as past: zvannyj (113) and dannyj 117
    // &&&& TODO
                }
            }
        }
    }

    ET_AccentType eAt = AT_UNDEFINED;
    if (AT_UNDEFINED == m_eAccentType2)
    {
        eAt = m_eAccentType1;
    }
    else
    {
        eAt = m_eAccentType2;
    }

    switch (eAt)
    {
        case AT_UNDEFINED:
        {
            ASSERT(0);
            ERROR_LOG (L"Undefined accent type.");
            return H_ERROR_GENERAL;
        }
        case AT_A:
        {
            vecStressType.push_back (STRESS_LOCATION_STEM);

            // Type sorvana: GDRL, p. 86
            if (SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm && 
                GENDER_F == eGender && 
                AT_C == m_pLexeme->eAccentType2())
            {
                vecStressType.push_back (STRESS_LOCATION_ENDING);
            }

            return H_NO_ERROR;
        }
        case AT_A1:
        {
            if (GENDER_M == eGender)
            {
                ASSERT(NUM_SG == eNumber);
                vecStressType.push_back(STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }
        case AT_B:
        {
            vecStressType.push_back (STRESS_LOCATION_ENDING);
            return H_NO_ERROR;
        }
        case AT_B1:
        {
            if (GENDER_M == eGender)
            {
                ASSERT(NUM_SG == eNumber);
                vecStressType.push_back(STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender || GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }
        case AT_C:
        {
            if (GENDER_M == eGender)
            {
                ASSERT(NUM_SG == eNumber);
                vecStressType.push_back(STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender || NUM_PL == eNumber)
            {
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }
        case AT_C1:
        {
            if (GENDER_M == eGender)
            {
                ASSERT(NUM_SG == eNumber);
                vecStressType.push_back(STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }
        case AT_C2:
        {
            if (GENDER_M == eGender)
            {
                ASSERT(NUM_SG == eNumber);
                vecStressType.push_back(STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }
        default:
        {
            ASSERT(0);
            ERROR_LOG (L"Illegal accent type.");
            return H_ERROR_INVALID_ARG;
        }
    }

    return H_ERROR_INVALID_ARG;

}   // eGetStressType()
Ejemplo n.º 11
0
//
// Applies "common deviation" adjustments to a short-form word form.
//
// Part 1 looks for common deviation 1 or 2, first in m_mapCommonDeviations
// and then in the lexeme, and trims characters from pWordForm->m_sWordForm
// accordingly. Part 2 handles common deviation 7 for short past passive
// participles.
//
// When a deviation is optional, the word form is cloned, the clone is added
// to the lexeme, and all further changes are made to the clone.
//
// Returns:
//   H_NO_MORE          -- deviation 1 was found but the form is not masculine
//   H_ERROR_UNEXPECTED -- a stress position below 1 was met while handling cd-7
//   H_EXCEPTION        -- a CException was caught (its description is logged)
//   H_NO_ERROR         -- otherwise
//
ET_ReturnCode CFormBuilderShortAdj::eHandleDeviations (CWordForm * pWordForm)
{
    ASSERT(m_pLexeme);   // the base class ctor is expected to have set the lexeme

    try
    {
        //
        // Part 1. Common deviations defined for adjectives (1-2).
        // The builder's own map is consulted before the lexeme.
        //
        bool bCdOptional = false;
        int iAdjCd = 1;
        while (iAdjCd <= 2)
        {
            map<int, bool>::iterator itLocalCd = m_mapCommonDeviations.find (iAdjCd);
            if (itLocalCd != m_mapCommonDeviations.end())
            {
                bCdOptional = itLocalCd->second;
                break;
            }
            if (m_pLexeme->bFindCommonDeviation (iAdjCd, bCdOptional))
            {
                break;
            }
            ++iAdjCd;
        }

        //
        // Only adjectives in -nnyj/-nnij or participia.
        // iAdjCd ends up as 3 when neither deviation was found.
        //
        if (iAdjCd <= 2)
        {
            if (GENDER_M != pWordForm->m_eGender && 1 == iAdjCd)
            {
                return H_NO_MORE;
            }

            if (SUBPARADIGM_SHORT_ADJ == m_eSubparadigm && bCdOptional)
            {
                // Optional deviation: keep the regular form and modify a copy
                CWordForm * pDeviatedCopy = NULL;
                CloneWordForm (pWordForm, pDeviatedCopy);
                m_pLexeme->AddWordForm (pDeviatedCopy);
                pWordForm = pDeviatedCopy;
            }

            if (GENDER_M != pWordForm->m_eGender)
            {
                // Non-masculine: remove the single character before the last one
                pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-2, 1);
            }
            else
            {
                // Masculine: erase from the last character; do it once more
                // if the form has a fleeting vowel
                pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-1);
                if (m_bFleetingVowel)
                {
                    pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-1);
                }
            }
        }

        //
        // Part 2. Common deviations defined for verbs (7-8)
        //
        int iVerbCd = -1;
        if (m_pLexeme->bHasCommonDeviation(7))
        {
            iVerbCd = 7;
        }
        else if (m_pLexeme->bHasCommonDeviation(8))
        {
            // TODO: cd-8 is detected but not acted upon
        }

        if (iVerbCd > 0 && SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm)
        {
            if (m_pLexeme->bDeviationOptional(iVerbCd))
            {
                // Optional deviation: keep the regular form and work on a copy
                CWordForm * pDeviatedCopy = NULL;
                CloneWordForm (pWordForm, pDeviatedCopy);
                m_pLexeme->AddWordForm (pDeviatedCopy);
                pWordForm = pDeviatedCopy;
            }

            // NOTE(review): mapCorrectedStress is filled in below but never
            // copied back into pWordForm->m_mapStress -- presumably an
            // assignment is missing here; confirm the intended behavior.
            map<int, ET_StressType> mapCorrectedStress;
            for (map<int, ET_StressType>::iterator itStress = pWordForm->m_mapStress.begin();
                 itStress != pWordForm->m_mapStress.end();
                 ++itStress)
            {
                if (!itStress->second)
                {
                    mapCorrectedStress[itStress->first] = STRESS_SECONDARY;
                }
                else
                {
                    if (itStress->first < 1)
                    {
                        ASSERT(0);
                        ERROR_LOG (L"Unexpected stress position in cd-7 or cd-8 participle.");
                        return H_ERROR_UNEXPECTED;
                    }
                    // The other stresses are re-keyed to (syllable count - 1)
                    CEString sWf (pWordForm->m_sWordForm);
                    mapCorrectedStress[sWf.uiNSyllables()-1] = STRESS_PRIMARY;
                }
            }
        }
    }
    catch (CException& ex)
    {
        CEString sMsg (L"Exception: ");
        sMsg += ex.szGetDescription();
        ERROR_LOG (sMsg);
        return H_EXCEPTION;
    }

    return H_NO_ERROR;

}   //  eHandleDeviations (...)
Ejemplo n.º 12
0
//
// Ad hoc test driver for CEString.
//
// The function first runs a handful of std::wstring replace()/erase() calls
// (reference behavior, results are not checked) and then exercises CEString:
// construction, indexing, comparison, searching, operators, insert/erase,
// case conversion, replace, trim, reverse, substring, tokenizer and syllable
// helpers, and number-to-string conversion. Every failed expectation is
// reported through ERROR_LOG; execution continues after a failure.
//
// Before returning, the logger is flushed and _CrtDumpMemoryLeaks() is called.
//
// argc, argv -- not used.
// Returns 0.
//
int _tmain(int argc, _TCHAR* argv[])
{
    //
    // std::wstring reference calls (results are not verified)
    //
    wstring sReplaceableB (L"0123456789012345");
    wstring sReplacedB = sReplaceableB.replace (5, 6, L"abcd");

    CEString sReplacableC(L"0123456789012345678901234567890123567890");
    CEString sReplacedCC = sReplacableC.sReplace(L"567890", L"abcd");
    CEString sReplacedC = sReplacableC.sReplace(5, 6, L"abcd");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (5, 3, L"a");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (8, 2, L"ab");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (5, 3, L"a");

    sReplaceableB = L"0123456789";
    sReplacedB = sReplaceableB.replace (8, 2, L"abc");

    wstring sEraseableB (L"0123456789");
    wstring sErasedB = sEraseableB.erase (5, 3);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5, 5);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5, 7);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase (5);

    sEraseableB = L"0123456789";
    sErasedB = sEraseableB.erase();

    // Start position past the end of the string: std::wstring::erase throws
    // here, the exception is intentionally ignored
    try
    {
        sEraseableB = L"0123456789";
        sErasedB = sEraseableB.erase (12, 7);
    }
    catch (...)
    {
    }

    try
    {
        sEraseableB = L"0123456789";
        sErasedB = sEraseableB.erase (12);
    }
    catch (...)
    {
    }

    // Ctors
    CEString sEmptyString;
    if (0 != sEmptyString.uiLength() || 0 != sEmptyString.uiGetNumOfTokens() || 
        0 != sEmptyString.uiGetNumOfFields() || 0 != sEmptyString.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }


    CEString sCopy (sEmptyString);
    if (0 != sCopy.uiLength() || 0 != sCopy.uiGetNumOfTokens() || 
        0 != sCopy.uiGetNumOfFields() || 0 != sCopy.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }

    sCopy = L"0123456789";
    CEString sCopy2 (sCopy);

    sCopy2.SetBreakChars (L" -/");
    CEString sCopy3 (sCopy2);

    CEString sFromCString (L"0123456789");
    if (10 != sFromCString.uiLength() || 1 != sFromCString.uiGetNumOfTokens() || 
        1 != sFromCString.uiGetNumOfFields() || 10 != sFromCString.uiGetVisibleLength())
    {
        ERROR_LOG (L"Initialization error");
    }

// TODO: operator ()

    // operator[]: write access at the start, in the middle and at the end,
    // then read access
    CEString sSquareBracketsTest (L"0123456789");
    sSquareBracketsTest[1] = L'a';
    if (sSquareBracketsTest != L"0a23456789")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    sSquareBracketsTest[0] = L'a';
    if (sSquareBracketsTest != L"a123456789")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    sSquareBracketsTest[9] = L'a';
    if (sSquareBracketsTest != L"012345678a")
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    sSquareBracketsTest = L"0123456789";
    CEString sLetter = sSquareBracketsTest[1];
    if (L"1" != sLetter)
    {
        ERROR_LOG (L"Square brackets operator error");
    }

    // Comparison
    ERelation eRet = CEString::eCompare (L"1234567", L"1234567");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234567", L"1234566");
    if (eRet != ecGreater)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234566", L"1234567");
    if (eRet != ecLess)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"123456", L"1234567");
    if (eRet != ecLess)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompare (L"1234567", L"123456");
    if (eRet != ecGreater)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompareNoCase (L"AbCdEfG", L"ABCDEFg");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    eRet = CEString::eCompareNoCase (L"АбВгДЕ", L"АБВГДе");
    if (ecEqual != eRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    // Searching
    bool bRet = CEString::bIn (L'2', L"0123456789");
    if (!bRet)
    {
        ERROR_LOG (L"bIn() failed.");
    }

    bRet = CEString::bIn (L'a', L"0123456789");
    if (bRet)
    {
        ERROR_LOG (L"bIn() failed.");
    }

    CEString sSearcheable (L"0123456789");
    unsigned int uiFindRet = sSearcheable.uiFind (L"123");
    if (1 != uiFindRet)
    {
        ERROR_LOG (L"uiFind() failed.");
    }

    uiFindRet = sSearcheable.uiFind (L"abc");
    if (ecNotFound != uiFindRet)
    {
        ERROR_LOG (L"uiFind() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    uiFindRet = sSearcheable.uiFindNoCase (L"bCDEF");
    if (ecNotFound == uiFindRet)
    {
        ERROR_LOG (L"uiFindNoCase() failed.");
    }

    sSearcheable = L"012345543210";
    uiFindRet = sSearcheable.uiRFind (L"5");
    if (6 != uiFindRet)
    {
        ERROR_LOG (L"uiRFind() failed.");
    }

//    unsigned int uiRFindNoCase (const wchar_t * szRhs) const

    sSearcheable = L"0123456789";
    uiFindRet = sSearcheable.uiFindFirstOf (L"234");
    if (2 != uiFindRet)
    {
        ERROR_LOG (L"uiFindFirstOf() failed.");
    }

//    unsigned int uiFindFirstOfNoCase (const wchar_t * szSet) const
    sSearcheable = L"0120120123456789";
    uiFindRet = sSearcheable.uiFindOneOf (3, L"234");
    if (5 != uiFindRet)
    {
        ERROR_LOG (L"uiFindOneOf() failed.");
    }

    sSearcheable = L"0123456789";
    uiFindRet = sSearcheable.uiFindLastOf (L"234");
    if (4 != uiFindRet)
    {
        ERROR_LOG (L"uiFindLastOf() failed.");
    }

//    unsigned int uiFindLastOfNoCase (const wchar_t * szSet) const
    bRet = sSearcheable.bStartsWith (L"012");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    bRet = sSearcheable.bStartsWith (L"234");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWith() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bStartsWithNoCase (L"abcd");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithNoCase() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bStartsWithOneOf (L"012");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOf (L"123");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOf() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bStartsWithOneOfNoCase (L"abc");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOfNoCase() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOfNoCase (L"bc");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOfNoCase() failed.");
    }

    sSearcheable = L"аБвГдЕёжзи";
    bRet = sSearcheable.bStartsWithOneOfNoCase (L"абв");
    if (!bRet)
    {
        ERROR_LOG (L"bStartsWithOneOfNoCase() failed.");
    }

    bRet = sSearcheable.bStartsWithOneOfNoCase (L"бв");
    if (bRet)
    {
        ERROR_LOG (L"bStartsWithOneOfNoCase() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bEndsWith (L"789");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWith() failed.");
    }

    bRet = sSearcheable.bEndsWith (L"123");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWith() failed.");
    }

    sSearcheable = L"abcdeFgHiJ";
    bRet = sSearcheable.bEndsWithNoCase (L"hij");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    bRet = sSearcheable.bEndsWithNoCase (L"ghi");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    sSearcheable = L"абвгдЕёЖзИ";
    bRet = sSearcheable.bEndsWithNoCase (L"жзи");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    bRet = sSearcheable.bEndsWithNoCase (L"ёжз");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithNoCase() failed.");
    }

    sSearcheable = L"0123456789";
    bRet = sSearcheable.bEndsWithOneOf (L"ab9");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOf() failed.");
    }

    bRet = sSearcheable.bEndsWithOneOf (L"ab8");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOf() failed.");
    }

    sSearcheable = L"aBcDeFgHiJ";
    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abj");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abc");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    sSearcheable = L"абвгдЕёЖзИ";
    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abи");
    if (!bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    bRet = sSearcheable.bEndsWithOneOfNoCase (L"abc");
    if (bRet)
    {
        ERROR_LOG (L"bEndsWithOneOfNoCase failed.");
    }

    // Operators
    CEString sLhs (L"01234");
    CEString sRhs (L"56789");
    
    bRet = (sLhs == sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs == L"01234");
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (L"01234" == sLhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

//    CString csLhs (L"01234");
//    CString csRhs (L"56789");
//    bRet = (L"01234" == csLhs);
//    if (!bRet)
//    {
//        ERROR_LOG (L"CString behavior does not match CEString behavior");
//    }

    bRet = (sLhs < sRhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs > sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs <= sRhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs >= sRhs);
    if (bRet)
    {
        ERROR_LOG (L"Comparison error");
    }

    bRet = (sLhs >= sLhs);
    if (!bRet)
    {
        ERROR_LOG (L"Comparison error");
    }


    // Assignment
    sEmptyString = L"0123456";
    if (sEmptyString != L"0123456")
    {
        ERROR_LOG (L"Assignemnt or comparison error");
    }

    sEmptyString = sRhs;
    if (sEmptyString != sRhs)
    {
        ERROR_LOG (L"Assignemnt or comparison error");
    }

    //CEString sResult = sLhs + sRhs;
    //if (sResult != L"0123456789")
    //{
    //    ERROR_LOG (L"Concatenation or comparison error");
    //}
    //sResult += L"<--Concatenated";
    //if (sResult != L"0123456789<--Concatenated")
    //{
    //    ERROR_LOG (L"Concatenation or comparison error");
    //}

    // Insert / erase. Each check expects the call to modify the object in
    // place and to return a value equal to the modified object.
    CEString sInsertable (L"0123789");
    CEString sInserted = sInsertable.sInsert (4, L"456");
    if (sInserted != sInsertable || sInsertable != L"0123456789")
    {
        ERROR_LOG (L"Insertion error");
    }

    sInsertable = L"012456789";
    sInserted = sInsertable.sInsert (3, L'3');
    if (sInserted != sInsertable || sInsertable != L"0123456789")
    {
        ERROR_LOG (L"Insertion error");
    }

    CEString sErasable (L"012abcd3456789");
    CEString sErased = sErasable.sErase (3, 4);
    if (sErased != sErasable || sErasable != L"0123456789")
    {
        ERROR_LOG (L"Erase error");
    }
    
    sErasable = L"0123456789";
    sErased = sErasable.sErase (3, 7);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    // Length argument larger than the remaining text
    sErasable = L"0123456789";
    sErased = sErasable.sErase (3, 40);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789";
    sErased = sErasable.sErase (3);
    if (sErased != sErasable || sErasable != L"012")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789a";
    sErased = sErasable.sErase (10);
    if (sErased != sErasable || sErasable != L"0123456789")
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable.Erase();
    if (!sErasable.bIsEmpty() || sErasable.uiLength() != 0)
    {
        ERROR_LOG (L"Erase error");
    }

    sErasable = L"0123456789";

    // Case conversion, Latin and Cyrillic
    CEString sConvertToUppercase(L"aAbBcC");
    sConvertToUppercase.ToUpper();
    if (sConvertToUppercase != L"AABBCC")
    {
        ERROR_LOG(L"ToUpper error");
    }

    sConvertToUppercase = CEString::sToUpper(L"aAbBcC");
    if (sConvertToUppercase != L"AABBCC")
    {
        ERROR_LOG(L"ToUpper error");
    }

    CEString sConvertToUppercaseCyr(L"aABbcCаАбБвВ");
    sConvertToUppercaseCyr.ToUpper();
    if (sConvertToUppercaseCyr != L"AABBCCААББВВ")
    {
        ERROR_LOG(L"ToUpper error for Cyrillic");
    }

    sConvertToUppercaseCyr = CEString::sToUpper(L"aAbBcCаАбБвВ");
    if (sConvertToUppercaseCyr != L"AABBCCААББВВ")
    {
        ERROR_LOG(L"sToUpper error for Cyrillic");
    }

    CEString sConvertToLowercase(L"aABbcC");
    sConvertToLowercase.ToLower();
    if (sConvertToLowercase != L"aabbcc")
    {
        ERROR_LOG(L"ToLower error");
    }

    sConvertToLowercase = CEString::sToLower(L"aAbBcC");
    if (sConvertToLowercase != L"aabbcc")
    {
        ERROR_LOG(L"ToLower error");
    }

    CEString sConvertToLowercaseCyr(L"aABbcCаАбБвВ");
    sConvertToLowercaseCyr.ToLower();
    if (sConvertToLowercaseCyr != L"aabbccааббвв")
    {
        ERROR_LOG(L"ToLower error for Cyrillic");
    }

    sConvertToLowercaseCyr = CEString::sToLower(L"aAbBcCаАбБвВ");
    if (sConvertToLowercaseCyr != L"aabbccааббвв")
    {
        ERROR_LOG(L"sToLower error for Cyrillic");
    }

    CEString sFromAscii = CEString::sToString("abcdefgxyzABCDEFGXYZ01234567890.,!");
    if (sFromAscii != L"abcdefgxyzABCDEFGXYZ01234567890.,!")
    {
        ERROR_LOG(L"sToString error for ascii conversion");
    }

    // Replace
    CEString sReplaceable(L"01abcd6789");
    CEString sReplaced = sReplaceable.sReplace (2, L"2345");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0123456abc";
    sReplaced = sReplaceable.sReplace (7, L"789");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0123456a89";
    sReplaced = sReplaceable.sReplace (7, L'7');
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"012345678a";
    sReplaced = sReplaceable.sReplace (9, L'9');
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }


    sReplaceable = L"01234abc89";
    sReplaced = sReplaceable.sReplace (5, 3, L"567");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234aaa6789";
    sReplaced = sReplaceable.sReplace (5, 3, L"5");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    // The result must go into sReplaced (not sErased) so that the check
    // below looks at the value returned by this very call
    sReplaceable = L"01234567ab";
    sReplaced = sReplaceable.sReplace (8, 2, L"89");
    if (sReplaced != sReplaceable || sReplaceable != L"0123456789")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234567a9";
    sReplaced = sReplaceable.sReplace (8, 2, L"8");
    if (sReplaced != sReplaceable || sReplaceable != L"012345678")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"01234567ab";
    sReplaced = sReplaceable.sReplace (8, 2, L"890");
    if (sReplaced != sReplaceable || sReplaceable != L"01234567890")
    {
        ERROR_LOG (L"Replace error");
    }

    sReplaceable = L"0ё2345ё78ёё";
    sReplaceable.Replace (0, 10, L'ё', L'е');
    if (sReplaceable != L"0е2345е78ее")
    {
        ERROR_LOG (L"Replace error");
    }


    // Trim
    CEString sTrimmable (L"     01234     ");
    sTrimmable.TrimLeft();
    if (sTrimmable != L"01234     ")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable.TrimRight();
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"     01234     ";
    sTrimmable.Trim();
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"=&=&=01234&&&==";
    sTrimmable.TrimLeft (L"=&");
    if (sTrimmable != L"01234&&&==")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable.Trim (L"=&");
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    sTrimmable = L"=&=&=01234&&&==";
    sTrimmable.Trim (L"=&");
    if (sTrimmable != L"01234")
    {
        ERROR_LOG (L"Trim or comparison error");
    }

    // Reverse
    CEString sReversable (L"0123456789");
    sReversable.Reverse();
    if (sReversable != L"9876543210")
    {
        ERROR_LOG (L"Reversing error");
    }

    // Substring
    CEString sWhole (L"0123456789");
    CEString sSubstr = sWhole.sSubstr (1, 3);
    if (sSubstr != L"123")
    {
        ERROR_LOG (L"Substring or comparison error");
    }

    sSubstr = sWhole.sSubstr (7);
    if (sSubstr != L"789")
    {
        ERROR_LOG (L"Substring or comparison error");
    }

    // Tokenizer: fields and tokens of "123 456 789" with a blank as break char
    CEString sFields (L"123 456 789");
    sFields.SetBreakChars (L" ");
    CEString sField = sFields.sGetField (1);
    if (sField != L"456")
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    StToken stToken = sFields.stGetField (0);
    if (3 != stToken.uiLength || 0 != stToken.uiOffset || ecTokenText != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    stToken = sFields.stGetField (0, ecTokenSpace);
    if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    // Out-of-range field test, currently disabled
    try
    {
//        stToken = sFields.stGetField (99);
//        ERROR_LOG (L"Tokenizer or comparison error");   // Exception expected
    }
    catch (CException& ex)
    {
//        ::MessageBox (NULL, ex.sGetDescription().c_str(), L"Kai Exception", MB_ICONWARNING);
    }

//    ST_Token st_GetFieldFromOffset (int i_offset,
//                                    et_TokenType eo_type = ec_TokenText);

    stToken = sFields.stGetTokenFromOffset (6);
    if (3 != stToken.uiLength || 4 != stToken.uiOffset || ecTokenText != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    ETokenType eType = sFields.eGetTokenType (1);
//    et_TokenType eo_GetTokenType (int i_offset, int i_at);
    if (ecTokenBreakChars != eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    stToken = sFields.stGetToken (1);
    if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    // Check the token returned by reference, not the stToken copy obtained above
    const StToken& rstToken = sFields.rstGetToken (1);
    if (1 != rstToken.uiLength || 3 != rstToken.uiOffset || ecTokenSpace != rstToken.eType)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    CEString sToken = sFields.sGetToken (1);
    if (sToken != L" ")
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    // Token index far out of range; any CException raised is logged
    try
    {
        CEString sToken1 = sFields.sGetToken(999);
        if (sToken1 != L" ")
        {
            ERROR_LOG(L"Tokenizer or comparison error");
        }
    }
    catch (CException& ex)
    {
        CEString sMsg(L"Exception: ");
        sMsg += ex.szGetDescription();
        ERROR_LOG(sMsg);
    }

    bool b_ = sFields.bGetNextToken(stToken);
    if (!b_ || ecTokenText != stToken.eType || 4 != stToken.uiOffset || 3 != stToken.uiLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    b_ = sFields.bGetPrevToken (stToken);
    if (!b_ || ecTokenBreakChars != stToken.eType || 3 != stToken.uiOffset || 1 != stToken.uiLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiTokenNum = sFields.uiGetTokenNum (stToken);
    if (1 != uiTokenNum)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiFields = sFields.uiGetNumOfFields();
    if (3 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    uiFields = sFields.uiGetNumOfFields (ecTokenSpace);
    if (2 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    uiFields = sFields.uiGetNumOfFields (3, 6);

    uiFields = sFields.uiNFields();
    if (3 != uiFields)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    uiFields = sFields.uiNFields (3, 6); 

    unsigned int uiTokens = sFields.uiGetNumOfTokens();
    if (5 != uiTokens)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    uiTokens = sFields.uiNTokens();
    if (5 != uiTokens)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiVLength = sFields.uiGetVisibleLength();
    if (11 != uiVLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

    unsigned int uiFLength = sFields.uiGetFieldLength (1);
    if (3 != uiFLength)
    {
        ERROR_LOG (L"Tokenizer or comparison error");
    }

//    CEString s (L"abcdefg");
//    wchar_t * szData = (wchar_t*)s;

    // Syllables
    CEString sSyllables (L"бавогузюы");
    sSyllables.SetVowels (L"аеёиоуыэюя");
    unsigned int uiSyllables = sSyllables.uiGetNumOfSyllables();
    if (5 != uiSyllables)
    {
        ERROR_LOG (L"Syllable count error");
    }

    uiSyllables = sSyllables.uiNSyllables();
    if (5 != uiSyllables)
    {
        ERROR_LOG (L"Syllable count error");
    }


    // Vowels & consonants
    unsigned int uiVowelPos = sSyllables.uiGetVowelPos (3);
    if (7 != uiVowelPos)
    {
        ERROR_LOG (L"Vowel position error");
    }
    
    unsigned int uiSyllPos = sSyllables.uiGetSyllableFromVowelPos (7);
    if (3 != uiSyllPos)
    {
        ERROR_LOG (L"Syllable position error");
    }

    // Integer to string conversion
    {
        CEString sConvert = CEString::sToString (9999999999999);
        if (L"9999999999999" != sConvert)
        {
            ERROR_LOG(L"Large int conversion error");
        }
        int i_ = 999999;
        sConvert = CEString::sToString (i_);
        if (L"999999" != sConvert)
        {
            ERROR_LOG(L"Int conversion error");
        }
    }

    // Floating point to string conversion (results are not verified)
    {
        CEString sConvert = CEString::sToString(999999999.9999);
        double d_ = 999999.999;
        sConvert = CEString::sToString(d_);
    }

    //
    // Done!
    //
    CLogger::pGetInstance()->Flush();

    // NOTE(review): the locals declared above are still alive at this point,
    // so their allocations may show up in the dump -- confirm before
    // treating the report as real leaks.
    _CrtDumpMemoryLeaks();

    return 0;
}
Ejemplo n.º 13
0
int CT_LexPreprocessor::iClassifyStems()
// For every endings subtable (ids 0 .. last id of "endings_meta"), collects
// the stems usable with it and passes their longest common suffixes to
// vInsertCommonSfx().
//
// Subtables are skipped when:
//   - fewer than MIN_NUMBER_OF_STEMS stems were found for them;
//   - the first stem and its lemma have no common prefix (iLCP() <= 0);
//   - the first stem is 4 or more characters longer than that common prefix.
//
// The suffix search first asks iLCP() for NUM_SFX suffixes and, while nothing
// usable is found, retries with the limit raised by 2, as long as the limit
// does not exceed MAX_NUM_SFX.
//
// Returns -1 if there is no database handle, 0 otherwise.
{
    if (m_pDb == NULL)
    {
        return -1;
    }
    const int MIN_NUMBER_OF_STEMS = 70;
    const int NUM_SFX = 5;
    const int MAX_NUM_SFX = 24;

    vector<CEString> vecStems;

    const int iLastSubtable = m_pDb->iLastID(L"endings_meta");
    for (int iSubtable = 0; iSubtable <= iLastSubtable; ++iSubtable)
    {
        vecStems.clear();
        CEString sFirstLemma = L"";
        int iCutRight = 0;
        CEString sLemmaEnding = L"";

        vLongStemsBySubtable(iSubtable, 2, vecStems, sFirstLemma);
        if (vecStems.size() < MIN_NUMBER_OF_STEMS)
        {
            continue;
        }

        // Find the longest common prefix of the first stem and the corresponding lemma
        CEString* arr_sStemAndLemma = new CEString[2];
        arr_sStemAndLemma[0] = vecStems[0];
        arr_sStemAndLemma[1] = sFirstLemma;
        CEString** parr_sPfx = new CEString*;
        *parr_sPfx = new CEString[1];
        const int iPfx = iLCP(arr_sStemAndLemma, parr_sPfx, 2, 1);

        // Copy the result out and release the temporary buffers right away,
        // so that none of the "continue" paths below (nor the normal path)
        // leaks them. The clean-up mirrors what is done for parr_sSfx below;
        // it assumes iLCP leaves *parr_sPfx pointing at a delete[]-able array
        // (or NULL) -- TODO confirm against iLCP.
        CEString sCommonPfx;
        if (iPfx > 0)
        {
            sCommonPfx = (*parr_sPfx)[0];
        }
        delete[] arr_sStemAndLemma;
        delete[] *parr_sPfx;
        delete parr_sPfx;

        if (iPfx <= 0)
        {
            continue;
        }

        iCutRight = vecStems[0].uiLength() - sCommonPfx.uiLength();
        if (iCutRight >= 4)
        {
            continue;
        }
        sLemmaEnding = sFirstLemma.sSubstr(sCommonPfx.uiLength(), sFirstLemma.uiLength() - sCommonPfx.uiLength());

        // Find longest common suffixes of the stems found
        int iStem = 0;
        CEString* arr_sStems = new CEString[vecStems.size()];
        CEString** parr_sSfx = new CEString*;
        *parr_sSfx = new CEString[1];
        for (vector<CEString>::iterator iterStems = vecStems.begin();
            iterStems != vecStems.end();
            ++iterStems, ++iStem)
        {
            // We reverse the stem so that iLCP could find suffixes
            // instead of prefixes
            (*iterStems).Reverse();
            arr_sStems[iStem] = *iterStems;
        }

        // Several attempts, each with a larger limit on the number of suffixes
        int iSfx = 0;
        int iMaxSfx = NUM_SFX;
        while (iSfx <= 0 && iMaxSfx <= MAX_NUM_SFX)
        {
            // Start every attempt from a fresh one-element result buffer
            delete[] *parr_sSfx;
            delete parr_sSfx;
            parr_sSfx = new CEString*;
            *parr_sSfx = new CEString[1];
            iSfx = iLCP(arr_sStems, parr_sSfx, vecStems.size(), iMaxSfx);
            if (iSfx == 1 && (*parr_sSfx)[0].uiLength() <= 0)
            {
                // A single empty suffix counts as "nothing found"
                iSfx = 0;
            }
            iMaxSfx += 2;
        }
        vInsertCommonSfx(parr_sSfx, iSfx, iSubtable, vecStems.size(), iCutRight, sLemmaEnding);

        delete[] arr_sStems;
        delete[] *parr_sSfx;
        delete parr_sSfx;

        // TEST
        //if (i_subtable > 100)
        //{
        //    break;
        //}
    }
    return 0;
}
Ejemplo n.º 14
0
//
// Reloads m_mmapIrregularForms from the "irregular_forms" table for this
// lexeme (rows with descriptor_id == m_stProperties.iDbKey), attaching the
// stress data found in "irregular_stress" to each word form.
//
// Also recomputes m_stProperties.bHasIrregularVariants: it is set to true
// if at least one loaded row has is_alternative set.
//
// Returns:
//   H_FALSE      the lexeme is not flagged as having irregular forms
//                (nothing is touched);
//   H_NO_ERROR   the forms were loaded;
//   H_EXCEPTION  an exception was caught while talking to the database
//                (details are written to the error log).
//
ET_ReturnCode CLexeme::eLoadIrregularForms()
{
    ET_ReturnCode rc = H_NO_ERROR;

    if (!m_stProperties.bHasIrregularForms)
    {
        return H_FALSE;
    }

    m_stProperties.bHasIrregularVariants = false;

    CEString sQuery 
        (L"SELECT id, gram_hash, wordform, is_alternative FROM irregular_forms WHERE descriptor_id = ");
    sQuery += CEString::sToString (m_stProperties.iDbKey);
    sQuery += L";";

    CSqlite * pDb = NULL;

    m_mmapIrregularForms.clear();

    try
    {
        pDb = m_pDictionary->pGetDbHandle();            
        unsigned int uiQueryHandle = pDb->uiPrepareForSelect (sQuery);
        while (pDb->bGetRow(uiQueryHandle))
        {
            int iId = -1;
            int iHash = -1;
            CEString sForm;
            bool bIsVariant = false;
            pDb->GetData (0, iId, uiQueryHandle);
            pDb->GetData (1, iHash, uiQueryHandle);
            pDb->GetData (2, sForm, uiQueryHandle);
            pDb->GetData (3, bIsVariant, uiQueryHandle);

            if (bIsVariant)
            {
                m_stProperties.bHasIrregularVariants = true;
            }

            CEString sStressQuery (L"SELECT position, is_primary FROM irregular_stress WHERE form_id = ");
            sStressQuery += CEString::sToString (iId);
            sStressQuery += L";";

            CWordForm * pWf = new CWordForm(iHash);
            try
            {
                pWf->m_pLexeme = this;
                pWf->m_bIrregular = true;
                pWf->m_sWordForm = sForm;

                unsigned int uiStressHandle = pDb->uiPrepareForSelect (sStressQuery);
                while (pDb->bGetRow (uiStressHandle))
                {
                    int iPos = -1;
                    bool bPrimary = false;
                    pDb->GetData (0, iPos, uiStressHandle);
                    pDb->GetData (1, bPrimary, uiStressHandle);
                    int iStressedSyll = sForm.uiGetSyllableFromVowelPos (iPos);
                    pWf->m_mapStress[iStressedSyll] = bPrimary ? STRESS_PRIMARY : STRESS_SECONDARY;
                }
                pDb->Finalize (uiStressHandle);
            }
            catch (...)
            {
                // The word form has not been handed over to the map yet,
                // so nobody else can release it.
                delete pWf;
                throw;
            }

            StIrregularForm stIf(pWf, bIsVariant);
            pair<int, StIrregularForm> pairHashToWordForm (iHash, stIf);
            m_mmapIrregularForms.insert (pairHashToWordForm);

        }   //  while (pDb->bGetRow (uiQueryHandle))

        pDb->Finalize(uiQueryHandle);
    }
    catch (CException& ex)      // by reference: no copy, no slicing
    {
        ERROR_LOG (ex.szGetDescription());
        rc = H_EXCEPTION;
    }
    catch (...)
    {
        CEString sMsg;
        if (NULL == pDb)
        {
            // The exception was thrown before a DB handle was obtained;
            // there is nothing to query for error details.
            sMsg = L"Apparent DB error ";
        }
        else
        {
            CEString sError;
            try
            {
                pDb->GetLastError (sError);
                // NOTE(review): the "%d" placeholder is never substituted;
                // the numeric code is simply appended after the error text.
                sMsg += CEString (L", error %d: ");
                sMsg += sError;
            }
            catch (...)
            {
                sMsg = L"Apparent DB error ";
            }
    
            sMsg += CEString::sToString(pDb->iGetLastError());
        }
        ERROR_LOG (sMsg);
        rc = H_EXCEPTION;
    }

    return rc;

}   //  eLoadIrregularForms()