int CAnalyzer::iCheckEndings(vector<CHasher>& vecPossibleWordforms, vector<stStemLinks>& vecStems, CEString sLeft, CEString sRight, int iStressPosEnding) // If pvec_stems_id IS NOT empty: // For every stem in pvec_stems_id, take the corresponding endings table // and look whether it contains an ending equal to sRight; // for every such ending, add a wordform to vecPossibleWordforms. // // If pvec_stems_id IS empty: // Look for an ending equal to sRight; for every such ending, // build a wordform and store it in vecPossibleWordforms. // (Identical wordforms are stored as one wordform.) { if (m_pDb == NULL) // || vecStems == NULL) { return -1; } static vector<int> vecGram; CEString str_query, sLemma; vector<CEString> vecLemma; for (vector<stStemLinks>::iterator itStems = vecStems.begin(); itStems != vecStems.end(); itStems++) { // For each *itStems look up the endings table ID in DB, then in this table try to find // endings which are equal to sRight. For each ending found, write the parameters // to tmpWf and then push_back tmpWf to vecPossibleWordforms: vecGram.clear(); vecGram = arr_freq_endings[(*itStems).iEndingsLink].m_vecFind (sRight, iStressPosEnding); if (vecGram.empty()) { continue; } for (vector<int>::iterator iter_endings = vecGram.begin(); iter_endings != vecGram.end(); iter_endings++) { CHasher tmpWf; tmpWf.hDecodeHash(*iter_endings); tmpWf.m_llLexemeId = (*itStems).llLexemeId; tmpWf.m_sLemma = (*itStems).sLemma; //tmpWf.str_WordForm = sLeft + sRight; //h_AddClassifyingCategories(&tmpWf); vecPossibleWordforms.push_back (tmpWf); } vecLemma.clear(); // that vector is different for every stem found } if (vecStems.empty()) // Try to guess the lexeme { if (sLeft.uiLength() <= 2) { return 0; } vector<int> vec_i_possible_ETs; // pair<unordered_multimap<wstring, int>::iterator, // unordered_multimap<wstring, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range((wstring)sRight); pair<multimap<CEString, int>::iterator, multimap<CEString, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range(sRight); for (; pair_search_result.first != pair_search_result.second; ++pair_search_result.first) { vec_i_possible_ETs.push_back(pair_search_result.first->second); } for (vector<int>::iterator iter_ET = vec_i_possible_ETs.begin(); iter_ET != vec_i_possible_ETs.end(); ++iter_ET) { if (arr_freq_endings[*iter_ET].m_sStemFinal.uiLength() > 0 && // !regex_match(sLeft, (const wregex)(L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinale + L")$"))) !sLeft.bRegexMatch (L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinal + L")$")) { continue; } if (sLeft.uiLength() <= arr_freq_endings[*iter_ET].m_iCutRight) { continue; } vecGram.clear(); vecGram = arr_freq_endings[*iter_ET].m_vecFind(sRight, -2); if (vecGram.empty()) { continue; } for (vector<int>::iterator itHash = vecGram.begin(); itHash != vecGram.end(); ++itHash) { CHasher tmpWf; tmpWf.m_sLemma = sLeft.sSubstr(0, sLeft.uiLength() - arr_freq_endings[*iter_ET].m_iCutRight) + arr_freq_endings[*iter_ET].m_sLemmaFinal; if (!bIsValidLemma (tmpWf.m_sLemma)) { continue; } // Check if what we've found is a new wordform bool bExists = false; for (vector<CHasher>::iterator itWf = vecPossibleWordforms.begin(); itWf != vecPossibleWordforms.end(); ++itWf) { if ((*itWf).m_sLemma == tmpWf.m_sLemma && (*itWf).iGramHash() == *itHash) { bExists = true; } } if (!bExists) { tmpWf.hDecodeHash(*itHash); tmpWf.m_llLexemeId = 0; vecPossibleWordforms.push_back(tmpWf); } } } } vecGram.clear(); return 0; }
int CAnalyzer::iAnalyze(CEString sWordform, vector<CHasher>& vecPossibleWordforms, BOOL bGuess) { // Be careful: it changes the input if (sWordform.uiLength() <= 0 || m_pDb == NULL) // || vecPossibleWordforms == NULL) { return -1; } // Multiple stress marks?? CEString sWordformOriginal(sWordform); int iStressPosStem = -1, iStressPosEnding = -1; int iStressPos = sWordform.uiFindOneOf(0, L"<\u0301"); if (ecNotFound == iStressPos) { iStressPos = -1; } else { sWordform.sErase(iStressPos, 1); } /* wsmatch result; bool b_match = regex_match(wstring(sWordform), result, (const wregex)L"^([^<\u0301]*)([<\u0301])(.*)$"); if (b_match == true) { CEString sLeft = (CEString)result[1]; CEString str_delimiter = (CEString)result[2]; CEString sRight = (CEString)result[3]; sWordform = sLeft + sRight; if (str_delimiter[0] == L'<') // кор<ова { iStressPos = sLeft.length(); } else // коро\u0301ва { iStressPos = sLeft.length() - 1; } } else { iStressPos = -1; } */ wsmatch result; bool bMatch = sWordform.bRegexMatch(L"^([^<\u0301]*)([<\u0301])(.*)$"); if (bMatch == true) { CEString sLeft = sWordform.sGetRegexMatch(0); CEString sDelimiter = sWordform.sGetRegexMatch(1); CEString sRight = sWordform.sGetRegexMatch(2); sWordform = sLeft + sRight; if (sDelimiter[0] == L'<') // кор<ова { iStressPos = sLeft.uiLength(); } else // коро\u0301ва { iStressPos = sLeft.uiLength() - 1; } } else { iStressPos = -1; } CEString sLeft, sRight; vector<stStemLinks> vecStems; vecPossibleWordforms.clear(); for (int iLeft = sWordform.uiLength(); iLeft >= 0; --iLeft) { sLeft = sWordform.sSubstr(0, iLeft); sRight = sWordform.sSubstr(iLeft, sWordform.uiLength() - iLeft); // Stress positions for the stem and the ending if (iStressPos == -1) { iStressPosStem = iStressPosEnding = -2; } else if (iStressPos >= sLeft.uiLength()) { iStressPosStem = -1; iStressPosEnding = iStressPos - sLeft.uiLength(); } else { iStressPosStem = iStressPos; iStressPosEnding = -1; } vecStems.clear(); iLookUpStems(vecStems, sLeft, iStressPosStem); if (vecStems.empty()) { continue; } iCheckEndings(vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding); } // If we have no result, try cutting of possible prefixes if (vecPossibleWordforms.empty()) { for (int iLeft = min(sWordformOriginal.uiLength(), 4); iLeft >= 1; --iLeft) { sLeft = sWordformOriginal.sSubstr(0, iLeft); sRight = sWordformOriginal.sSubstr(iLeft, sWordformOriginal.uiLength() - iLeft); if (iLeft == 4) { if (sLeft == L"пол-") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_eNumber != NUM_SG || vecPossibleWordforms[iWf].m_eCase != CASE_GEN) { vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } else if (iLeft == 3) { if (sLeft == L"пол") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_eNumber != NUM_SG || vecPossibleWordforms[iWf].m_eCase != CASE_GEN) { vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } else if (iLeft == 2) { if (sLeft == L"не") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_ePos != POS_ADJ || vecPossibleWordforms[iWf].m_ePos != POS_VERB || (vecPossibleWordforms[iWf].m_ePos == POS_VERB && (vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PRESENT_TENSE || vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PAST_TENSE || vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_INFINITIVE))) { vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } } } // Now, if we haven't found anything, we may guess the lexeme if (vecPossibleWordforms.empty() && bGuess == TRUE) { for (int iLeft = 0; iLeft <= sWordform.uiLength(); ++iLeft) { sLeft = sWordform.sSubstr (0, iLeft); sRight = sWordform.sSubstr (iLeft, sWordform.uiLength() - iLeft); // Stress positions for the stem and the ending if (iStressPos == -1) { iStressPosStem = iStressPosEnding = -2; } else if (iStressPos >= sLeft.uiLength()) { iStressPosStem = -1; iStressPosEnding = iStressPos - sLeft.uiLength(); } else { iStressPosStem = iStressPos; iStressPosEnding = -1; } vecStems.clear(); iCheckEndings (vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding); if ((bContainsPlausibleVariants (vecPossibleWordforms) && sRight.uiLength() <= 3) || vecPossibleWordforms.size() >= 4) { break; } } if (vecPossibleWordforms.size() > 4) { LeaveMostPlausible (vecPossibleWordforms); } } return vecPossibleWordforms.size(); }