Exemplo n.º 1
0
int CAnalyzer::iCheckEndings(vector<CHasher>& vecPossibleWordforms,
                             vector<stStemLinks>& vecStems,
                             CEString sLeft,
                             CEString sRight,
                             int iStressPosEnding)
// If pvec_stems_id IS NOT empty:
// For every stem in pvec_stems_id, take the corresponding endings table
// and look whether it contains an ending equal to sRight;
// for every such ending, add a wordform to vecPossibleWordforms.
//
// If pvec_stems_id IS empty:
// Look for an ending equal to sRight; for every such ending,
// build a wordform and store it in vecPossibleWordforms.
// (Identical wordforms are stored as one wordform.)
{
    if (m_pDb == NULL) // || vecStems == NULL)
    {
        return -1;
    }
    static vector<int> vecGram;
    CEString str_query, sLemma;
    vector<CEString> vecLemma;

    for (vector<stStemLinks>::iterator itStems = vecStems.begin();
        itStems != vecStems.end(); itStems++)
    {
        // For each *itStems look up the endings table ID in DB, then in this table try to find
        // endings which are equal to sRight. For each ending found, write the parameters
        // to tmpWf and then push_back tmpWf to vecPossibleWordforms:
        vecGram.clear();
        vecGram = arr_freq_endings[(*itStems).iEndingsLink].m_vecFind (sRight, iStressPosEnding);
        if (vecGram.empty())
        {
            continue;
        }
        for (vector<int>::iterator iter_endings = vecGram.begin();
            iter_endings != vecGram.end(); iter_endings++)
        {
            CHasher tmpWf;
            tmpWf.hDecodeHash(*iter_endings);
            tmpWf.m_llLexemeId = (*itStems).llLexemeId;
            tmpWf.m_sLemma = (*itStems).sLemma;
            //tmpWf.str_WordForm = sLeft + sRight;
            //h_AddClassifyingCategories(&tmpWf);
            vecPossibleWordforms.push_back (tmpWf);
        }
        vecLemma.clear(); // that vector is different for every stem found
    }

    if (vecStems.empty())
    // Try to guess the lexeme
    {
        if (sLeft.uiLength() <= 2)
        {
            return 0;
        }
        vector<int> vec_i_possible_ETs;
//        pair<unordered_multimap<wstring, int>::iterator,
//             unordered_multimap<wstring, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range((wstring)sRight);
        pair<multimap<CEString, int>::iterator,
             multimap<CEString, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range(sRight);
        for (; pair_search_result.first != pair_search_result.second; ++pair_search_result.first)
        {
            vec_i_possible_ETs.push_back(pair_search_result.first->second);
        }
        for (vector<int>::iterator iter_ET = vec_i_possible_ETs.begin();
             iter_ET != vec_i_possible_ETs.end();
             ++iter_ET)
        {
            if (arr_freq_endings[*iter_ET].m_sStemFinal.uiLength() > 0 &&
//                !regex_match(sLeft, (const wregex)(L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinale + L")$")))
                !sLeft.bRegexMatch (L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinal + L")$"))
            {
                continue;
            }
            if (sLeft.uiLength() <= arr_freq_endings[*iter_ET].m_iCutRight)
            {
                continue;
            }
            vecGram.clear();
            vecGram = arr_freq_endings[*iter_ET].m_vecFind(sRight, -2);
            if (vecGram.empty())
            {
                continue;
            }
            for (vector<int>::iterator itHash = vecGram.begin();
                 itHash != vecGram.end(); ++itHash)
            {
                CHasher tmpWf;
                tmpWf.m_sLemma = sLeft.sSubstr(0, sLeft.uiLength() - arr_freq_endings[*iter_ET].m_iCutRight) + arr_freq_endings[*iter_ET].m_sLemmaFinal;
                if (!bIsValidLemma (tmpWf.m_sLemma))
                {
                    continue;
                }
                // Check if what we've found is a new wordform
                bool bExists = false;
                for (vector<CHasher>::iterator itWf = vecPossibleWordforms.begin();
                     itWf != vecPossibleWordforms.end();
                     ++itWf)
                {
                    if ((*itWf).m_sLemma == tmpWf.m_sLemma && (*itWf).iGramHash() == *itHash)
                    {
                        bExists = true;
                    }
                }
                if (!bExists)
                {
                    tmpWf.hDecodeHash(*itHash);
                    tmpWf.m_llLexemeId = 0;
                    vecPossibleWordforms.push_back(tmpWf);
                }
            }
        }
    }
    vecGram.clear();
    return 0;
}
Exemplo n.º 2
0
int CAnalyzer::iAnalyze(CEString sWordform,
                        vector<CHasher>& vecPossibleWordforms,
                        BOOL bGuess)
{
    // Be careful: it changes the input
    if (sWordform.uiLength() <= 0 || m_pDb == NULL) // || vecPossibleWordforms == NULL)
    {
        return -1;
    }

// Multiple stress marks??

    CEString sWordformOriginal(sWordform);
    int iStressPosStem = -1, iStressPosEnding = -1;
    int iStressPos = sWordform.uiFindOneOf(0, L"<\u0301");
    if (ecNotFound == iStressPos)
    {
        iStressPos = -1;
    }
    else
    {
        sWordform.sErase(iStressPos, 1);
    }

/*
    wsmatch result;
    bool b_match = regex_match(wstring(sWordform), result, (const wregex)L"^([^<\u0301]*)([<\u0301])(.*)$");
    if (b_match == true)
    {
        CEString sLeft = (CEString)result[1];
        CEString str_delimiter = (CEString)result[2];
        CEString sRight = (CEString)result[3];
        sWordform = sLeft + sRight;
        if (str_delimiter[0] == L'<')       // кор<ова
        {
            iStressPos = sLeft.length();
        }
        else                                // коро\u0301ва
        {
            iStressPos = sLeft.length() - 1;
        }
    }
    else
    {
        iStressPos = -1;
    }
*/

    wsmatch result;
    bool bMatch = sWordform.bRegexMatch(L"^([^<\u0301]*)([<\u0301])(.*)$");
    if (bMatch == true)
    {
        CEString sLeft = sWordform.sGetRegexMatch(0);
        CEString sDelimiter = sWordform.sGetRegexMatch(1);
        CEString sRight = sWordform.sGetRegexMatch(2);
        sWordform = sLeft + sRight;
        if (sDelimiter[0] == L'<')       // кор<ова
        {
            iStressPos = sLeft.uiLength();
        }
        else                                // коро\u0301ва
        {
            iStressPos = sLeft.uiLength() - 1;
        }
    }
    else
    {
        iStressPos = -1;
    }

    CEString sLeft, sRight;
    vector<stStemLinks> vecStems;
    vecPossibleWordforms.clear();
    for (int iLeft = sWordform.uiLength(); iLeft >= 0; --iLeft)
    {
        sLeft = sWordform.sSubstr(0, iLeft);
        sRight = sWordform.sSubstr(iLeft, sWordform.uiLength() - iLeft);

        // Stress positions for the stem and the ending
        if (iStressPos == -1)
        {
            iStressPosStem = iStressPosEnding = -2;
        }
        else if (iStressPos >= sLeft.uiLength())
        {
            iStressPosStem = -1;
            iStressPosEnding = iStressPos - sLeft.uiLength();
        }
        else
        {
            iStressPosStem = iStressPos;
            iStressPosEnding = -1;
        }
        
        vecStems.clear();
        iLookUpStems(vecStems, sLeft, iStressPosStem);
        if (vecStems.empty())
        {
            continue;
        }
        iCheckEndings(vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
    }

    // If we have no result, try cutting of possible prefixes
    if (vecPossibleWordforms.empty())
    {
        for (int iLeft = min(sWordformOriginal.uiLength(), 4); iLeft >= 1; --iLeft)
        {
            sLeft = sWordformOriginal.sSubstr(0, iLeft);
            sRight = sWordformOriginal.sSubstr(iLeft, sWordformOriginal.uiLength() - iLeft);
            if (iLeft == 4)
            {
                if (sLeft == L"пол-")
                {
                    int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess);
                    if (iResult > 0)
                    {
                        for (int iWf = iResult - 1; iWf >= 0; --iWf)
                        {
                            if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN ||
                                vecPossibleWordforms[iWf].m_eNumber != NUM_SG ||
                                vecPossibleWordforms[iWf].m_eCase != CASE_GEN)
                            {
                                vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf);
                            }
                        }
                        if (vecPossibleWordforms.size() > 0)
                        {
                            return vecPossibleWordforms.size();
                        }
                    }
                }
            }
            else if (iLeft == 3)
            {
                if (sLeft == L"пол")
                {
                    int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess);
                    if (iResult > 0)
                    {
                        for (int iWf = iResult - 1; iWf >= 0; --iWf)
                        {
                            if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN ||
                                vecPossibleWordforms[iWf].m_eNumber != NUM_SG ||
                                vecPossibleWordforms[iWf].m_eCase != CASE_GEN)
                            {
                                vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf);
                            }
                        }
                        if (vecPossibleWordforms.size() > 0)
                        {
                            return vecPossibleWordforms.size();
                        }
                    }
                }
            }
            else if (iLeft == 2)
            {
                if (sLeft == L"не")
                {
                    int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess);
                    if (iResult > 0)
                    {
                        for (int iWf = iResult - 1; iWf >= 0; --iWf)
                        {
                            if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN ||
                                vecPossibleWordforms[iWf].m_ePos != POS_ADJ ||
                                vecPossibleWordforms[iWf].m_ePos != POS_VERB ||
                                (vecPossibleWordforms[iWf].m_ePos == POS_VERB &&
                                 (vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PRESENT_TENSE ||
                                  vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PAST_TENSE ||
                                  vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_INFINITIVE)))
                            {
                                vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf);
                            }
                        }
                        if (vecPossibleWordforms.size() > 0)
                        {
                            return vecPossibleWordforms.size();
                        }
                    }
                }
            }
        }
    }
    // Now, if we haven't found anything, we may guess the lexeme
    if (vecPossibleWordforms.empty() && bGuess == TRUE)
    {
        for (int iLeft = 0; iLeft <= sWordform.uiLength(); ++iLeft)
        {
            sLeft = sWordform.sSubstr (0, iLeft);
            sRight = sWordform.sSubstr (iLeft, sWordform.uiLength() - iLeft);

            // Stress positions for the stem and the ending
            if (iStressPos == -1)
            {
                iStressPosStem = iStressPosEnding = -2;
            }
            else if (iStressPos >= sLeft.uiLength())
            {
                iStressPosStem = -1;
                iStressPosEnding = iStressPos - sLeft.uiLength();
            }
            else
            {
                iStressPosStem = iStressPos;
                iStressPosEnding = -1;
            }
            vecStems.clear();
            iCheckEndings (vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding);
            if ((bContainsPlausibleVariants (vecPossibleWordforms) && sRight.uiLength() <= 3) ||
                vecPossibleWordforms.size() >= 4)
            {
                break;
            }
        }
        if (vecPossibleWordforms.size() > 4)
        {
            LeaveMostPlausible (vecPossibleWordforms);
        }
    }
    return vecPossibleWordforms.size();
}