//
// Returns a copy of s_ with the stress mark "<" inserted in front of the
// character at index iLetter. If iLetter is negative or is not a valid
// index into s_, the string is returned unchanged.
//
CEString CT_LexPreprocessor::sInsertStress(int iLetter, CEString s_)
{
    const int iLength = (int)(s_.uiLength());
    if (iLetter < 0 || iLetter >= iLength)
    {
        return s_;
    }

    CEString sBeforeMark (s_.sSubstr(0, iLetter));
    CEString sFromLetter (s_.sSubstr(iLetter, s_.uiLength() - iLetter));

    return sBeforeMark + L"<" + sFromLetter;
}
//
// Plausibility filter for a lemma candidate. The candidate is accepted
// only if it contains at least one vowel letter and matches none of the
// "forbidden" patterns listed below. The patterns are tried in the order
// given; the first one that matches rejects the candidate.
//
bool CAnalyzer::bIsValidLemma(CEString sWf)
{
    // There must be at least one vowel (either case).
    if (!sWf.bRegexSearch (L"[аеёиоуыэюяАЕЁИОУЫЭЮЯ]"))
    {
        return false;
    }

    static const wchar_t * const arrForbidden[] =
    {
        L"[аеёиоуыэюяъь][ьъ]",                      // sign letter after a vowel or another sign
        L"ъ[аоуыэи]",                               // hard sign before one of these vowels
        L"[аоэуе][аоэуы]ть$",                       // two such vowels in a row before final "ть"
        L"[кгхц]ь$",                                // final soft sign after к, г, х, ц
        L"[кгх]ый$",                                // final "ый" after к, г, х
        L"[жчшщ]ы",                                 // "ы" after ж, ч, ш, щ
        L"ы$",                                      // final "ы"
        L"[бвгджзклмнпрстфхцчшщ](й|ъ$)",            // consonant + й, or final consonant + ъ
        L"[бвгджзклмнпрстфхцчшщ]{4}$"               // four consonants at the end
    };

    const int iNumPatterns = sizeof (arrForbidden) / sizeof (arrForbidden[0]);
    for (int iPattern = 0; iPattern < iNumPatterns; ++iPattern)
    {
        if (sWf.bRegexSearch (arrForbidden[iPattern]))
        {
            return false;
        }
    }

    return true;
}
//
// Removes the stress mark "<" located by uiFind from s_ (in place) and
// returns the index the mark occupied, i.e. the index of the letter that
// came right after it. If uiFind yields a position that is not inside the
// string, s_ is left untouched and -1 is returned.
//
int CT_LexPreprocessor::iDeleteStress(CEString& s_)
{
    const unsigned int uiMarkAt = s_.uiFind(L"<");
    if (uiMarkAt >= s_.uiLength())
    {
        return -1;
    }

    CEString sHead (s_.sSubstr(0, uiMarkAt));
    CEString sTail (s_.sSubstr(uiMarkAt + 1, s_.uiLength() - uiMarkAt - 1));
    s_ = sHead + sTail;

    return (int)uiMarkAt;
}
//
// Finds the alternating preverb sVerbForm starts with.
//
// The plain preverb list is searched first, then the list of preverbs with
// voicing; the first entry that is a prefix of sVerbForm wins and is copied
// to sPreverb.
//
// NB: bVoicing is only ever set to true (when the match comes from the
// "with voicing" list); it is never reset here, so the caller is expected
// to initialize it.
//
// Returns:
//   H_NO_MORE            the lexeme has no fleeting vowel (nothing to do)
//   H_FALSE              no preverb from either list matches
//   H_ERROR_INVALID_ARG  the verb form is shorter than preverb + 2 characters
//   H_NO_ERROR           a preverb was found
//
ET_ReturnCode CLexeme::eGetAlternatingPreverb (const CEString& sVerbForm, CEString& sPreverb, bool& bVoicing)
{
    if (!m_stProperties.bFleetingVowel)
    {
        return H_NO_MORE;
    }

    // types 5, 6, 7, 8, 9, 11, 14

    bool bFound = false;

    vector<CEString>::iterator itCandidate = m_vecAlternatingPreverbs.begin();
    while (itCandidate != m_vecAlternatingPreverbs.end())
    {
        if (sVerbForm.bStartsWith (*itCandidate))
        {
            sPreverb = *itCandidate;
            bFound = true;
            break;
        }
        ++itCandidate;
    }

    if (!bFound)
    {
        itCandidate = m_vecAlternatingPreverbsWithVoicing.begin();
        while (itCandidate != m_vecAlternatingPreverbsWithVoicing.end())
        {
            if (sVerbForm.bStartsWith (*itCandidate))
            {
                sPreverb = *itCandidate;
                bFound = true;
                bVoicing = true;
                break;
            }
            ++itCandidate;
        }
    }

    if (!bFound)
    {
        return H_FALSE;
    }

    if (sVerbForm.uiLength() < sPreverb.uiLength() + 2)
    {
        ASSERT(0);
        ERROR_LOG (L"Stem too short.");
        return H_ERROR_INVALID_ARG;
    }

    return H_NO_ERROR;

}       //  eGetAlternatingPreverb(...)
//
// Computes the stress position(s) for a form with the given ending.
//
//   STRESS_LOCATION_STEM    delegated to eGetStemStressPositions(), whose
//                           return code is passed through;
//   STRESS_LOCATION_ENDING  a single position is appended to vecStressPos:
//                             - ending has no syllable: syllable count of the lemma - 1;
//                             - inflection type "мс-п" with one of the endings
//                               -его/-ему/-ого/-ому: syllable count of the lemma + 1
//                               (одног<о, твоем<у);
//                             - otherwise: syllable count of the lemma;
//   anything else           H_ERROR_INVALID_ARG.
//
// m_sLemma gets the Russian vowel set assigned on every call.
//
ET_ReturnCode CFormBuilderPronounAdj::eGetStressPositions (const CEString& sEnding,
                                                           ET_StressLocation eStressType,
                                                           vector<int>& vecStressPos)
{
    m_sLemma.SetVowels (g_szRusVowels);

    if (STRESS_LOCATION_STEM == eStressType)
    {
        return eGetStemStressPositions (m_sLemma, m_eSubparadigm, vecStressPos);
    }

    if (STRESS_LOCATION_ENDING != eStressType)
    {
        ASSERT(0);
        ERROR_LOG (L"Illegal stress type.");
        return H_ERROR_INVALID_ARG;
    }

    int iStressPos = -1;
    if (sEnding.uiNSyllables() < 1)
    {
        iStressPos = m_sLemma.uiNSyllables() - 1;
    }
    else
    {
        const bool bStressOnSecondEndingSyllable =
            L"мс-п" == m_pLexeme->sInflectionType() &&
            (L"его" == sEnding || L"ему" == sEnding || L"ого" == sEnding || L"ому" == sEnding);

        if (bStressOnSecondEndingSyllable)
        {
            iStressPos = m_sLemma.uiNSyllables() + 1;       // одног<о, твоем<у
        }
        else
        {
            iStressPos = m_sLemma.uiNSyllables();
        }
    }

    vecStressPos.push_back (iStressPos);

    return H_NO_ERROR;

}       //  eGetStressPositions (...)
int CAnalyzer::iCheckEndings(vector<CHasher>& vecPossibleWordforms, vector<stStemLinks>& vecStems, CEString sLeft, CEString sRight, int iStressPosEnding) // If pvec_stems_id IS NOT empty: // For every stem in pvec_stems_id, take the corresponding endings table // and look whether it contains an ending equal to sRight; // for every such ending, add a wordform to vecPossibleWordforms. // // If pvec_stems_id IS empty: // Look for an ending equal to sRight; for every such ending, // build a wordform and store it in vecPossibleWordforms. // (Identical wordforms are stored as one wordform.) { if (m_pDb == NULL) // || vecStems == NULL) { return -1; } static vector<int> vecGram; CEString str_query, sLemma; vector<CEString> vecLemma; for (vector<stStemLinks>::iterator itStems = vecStems.begin(); itStems != vecStems.end(); itStems++) { // For each *itStems look up the endings table ID in DB, then in this table try to find // endings which are equal to sRight. For each ending found, write the parameters // to tmpWf and then push_back tmpWf to vecPossibleWordforms: vecGram.clear(); vecGram = arr_freq_endings[(*itStems).iEndingsLink].m_vecFind (sRight, iStressPosEnding); if (vecGram.empty()) { continue; } for (vector<int>::iterator iter_endings = vecGram.begin(); iter_endings != vecGram.end(); iter_endings++) { CHasher tmpWf; tmpWf.hDecodeHash(*iter_endings); tmpWf.m_llLexemeId = (*itStems).llLexemeId; tmpWf.m_sLemma = (*itStems).sLemma; //tmpWf.str_WordForm = sLeft + sRight; //h_AddClassifyingCategories(&tmpWf); vecPossibleWordforms.push_back (tmpWf); } vecLemma.clear(); // that vector is different for every stem found } if (vecStems.empty()) // Try to guess the lexeme { if (sLeft.uiLength() <= 2) { return 0; } vector<int> vec_i_possible_ETs; // pair<unordered_multimap<wstring, int>::iterator, // unordered_multimap<wstring, int>::iterator> pair_search_result = umap_endings2subtbl.equal_range((wstring)sRight); pair<multimap<CEString, int>::iterator, multimap<CEString, 
int>::iterator> pair_search_result = umap_endings2subtbl.equal_range(sRight); for (; pair_search_result.first != pair_search_result.second; ++pair_search_result.first) { vec_i_possible_ETs.push_back(pair_search_result.first->second); } for (vector<int>::iterator iter_ET = vec_i_possible_ETs.begin(); iter_ET != vec_i_possible_ETs.end(); ++iter_ET) { if (arr_freq_endings[*iter_ET].m_sStemFinal.uiLength() > 0 && // !regex_match(sLeft, (const wregex)(L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinale + L")$"))) !sLeft.bRegexMatch (L"^.*(" + arr_freq_endings[*iter_ET].m_sStemFinal + L")$")) { continue; } if (sLeft.uiLength() <= arr_freq_endings[*iter_ET].m_iCutRight) { continue; } vecGram.clear(); vecGram = arr_freq_endings[*iter_ET].m_vecFind(sRight, -2); if (vecGram.empty()) { continue; } for (vector<int>::iterator itHash = vecGram.begin(); itHash != vecGram.end(); ++itHash) { CHasher tmpWf; tmpWf.m_sLemma = sLeft.sSubstr(0, sLeft.uiLength() - arr_freq_endings[*iter_ET].m_iCutRight) + arr_freq_endings[*iter_ET].m_sLemmaFinal; if (!bIsValidLemma (tmpWf.m_sLemma)) { continue; } // Check if what we've found is a new wordform bool bExists = false; for (vector<CHasher>::iterator itWf = vecPossibleWordforms.begin(); itWf != vecPossibleWordforms.end(); ++itWf) { if ((*itWf).m_sLemma == tmpWf.m_sLemma && (*itWf).iGramHash() == *itHash) { bExists = true; } } if (!bExists) { tmpWf.hDecodeHash(*itHash); tmpWf.m_llLexemeId = 0; vecPossibleWordforms.push_back(tmpWf); } } } } vecGram.clear(); return 0; }
int CAnalyzer::iAnalyze(CEString sWordform, vector<CHasher>& vecPossibleWordforms, BOOL bGuess) { // Be careful: it changes the input if (sWordform.uiLength() <= 0 || m_pDb == NULL) // || vecPossibleWordforms == NULL) { return -1; } // Multiple stress marks?? CEString sWordformOriginal(sWordform); int iStressPosStem = -1, iStressPosEnding = -1; int iStressPos = sWordform.uiFindOneOf(0, L"<\u0301"); if (ecNotFound == iStressPos) { iStressPos = -1; } else { sWordform.sErase(iStressPos, 1); } /* wsmatch result; bool b_match = regex_match(wstring(sWordform), result, (const wregex)L"^([^<\u0301]*)([<\u0301])(.*)$"); if (b_match == true) { CEString sLeft = (CEString)result[1]; CEString str_delimiter = (CEString)result[2]; CEString sRight = (CEString)result[3]; sWordform = sLeft + sRight; if (str_delimiter[0] == L'<') // кор<ова { iStressPos = sLeft.length(); } else // коро\u0301ва { iStressPos = sLeft.length() - 1; } } else { iStressPos = -1; } */ wsmatch result; bool bMatch = sWordform.bRegexMatch(L"^([^<\u0301]*)([<\u0301])(.*)$"); if (bMatch == true) { CEString sLeft = sWordform.sGetRegexMatch(0); CEString sDelimiter = sWordform.sGetRegexMatch(1); CEString sRight = sWordform.sGetRegexMatch(2); sWordform = sLeft + sRight; if (sDelimiter[0] == L'<') // кор<ова { iStressPos = sLeft.uiLength(); } else // коро\u0301ва { iStressPos = sLeft.uiLength() - 1; } } else { iStressPos = -1; } CEString sLeft, sRight; vector<stStemLinks> vecStems; vecPossibleWordforms.clear(); for (int iLeft = sWordform.uiLength(); iLeft >= 0; --iLeft) { sLeft = sWordform.sSubstr(0, iLeft); sRight = sWordform.sSubstr(iLeft, sWordform.uiLength() - iLeft); // Stress positions for the stem and the ending if (iStressPos == -1) { iStressPosStem = iStressPosEnding = -2; } else if (iStressPos >= sLeft.uiLength()) { iStressPosStem = -1; iStressPosEnding = iStressPos - sLeft.uiLength(); } else { iStressPosStem = iStressPos; iStressPosEnding = -1; } vecStems.clear(); iLookUpStems(vecStems, sLeft, 
iStressPosStem); if (vecStems.empty()) { continue; } iCheckEndings(vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding); } // If we have no result, try cutting of possible prefixes if (vecPossibleWordforms.empty()) { for (int iLeft = min(sWordformOriginal.uiLength(), 4); iLeft >= 1; --iLeft) { sLeft = sWordformOriginal.sSubstr(0, iLeft); sRight = sWordformOriginal.sSubstr(iLeft, sWordformOriginal.uiLength() - iLeft); if (iLeft == 4) { if (sLeft == L"пол-") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_eNumber != NUM_SG || vecPossibleWordforms[iWf].m_eCase != CASE_GEN) { vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } else if (iLeft == 3) { if (sLeft == L"пол") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_eNumber != NUM_SG || vecPossibleWordforms[iWf].m_eCase != CASE_GEN) { vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } else if (iLeft == 2) { if (sLeft == L"не") { int iResult = iAnalyze(sRight, vecPossibleWordforms, bGuess); if (iResult > 0) { for (int iWf = iResult - 1; iWf >= 0; --iWf) { if (vecPossibleWordforms[iWf].m_ePos != POS_NOUN || vecPossibleWordforms[iWf].m_ePos != POS_ADJ || vecPossibleWordforms[iWf].m_ePos != POS_VERB || (vecPossibleWordforms[iWf].m_ePos == POS_VERB && (vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PRESENT_TENSE || vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_PAST_TENSE || vecPossibleWordforms[iWf].m_eSubparadigm == SUBPARADIGM_INFINITIVE))) { 
vecPossibleWordforms.erase(vecPossibleWordforms.begin() + iWf); } } if (vecPossibleWordforms.size() > 0) { return vecPossibleWordforms.size(); } } } } } } // Now, if we haven't found anything, we may guess the lexeme if (vecPossibleWordforms.empty() && bGuess == TRUE) { for (int iLeft = 0; iLeft <= sWordform.uiLength(); ++iLeft) { sLeft = sWordform.sSubstr (0, iLeft); sRight = sWordform.sSubstr (iLeft, sWordform.uiLength() - iLeft); // Stress positions for the stem and the ending if (iStressPos == -1) { iStressPosStem = iStressPosEnding = -2; } else if (iStressPos >= sLeft.uiLength()) { iStressPosStem = -1; iStressPosEnding = iStressPos - sLeft.uiLength(); } else { iStressPosStem = iStressPos; iStressPosEnding = -1; } vecStems.clear(); iCheckEndings (vecPossibleWordforms, vecStems, sLeft, sRight, iStressPosEnding); if ((bContainsPlausibleVariants (vecPossibleWordforms) && sRight.uiLength() <= 3) || vecPossibleWordforms.size() >= 4) { break; } } if (vecPossibleWordforms.size() > 4) { LeaveMostPlausible (vecPossibleWordforms); } } return vecPossibleWordforms.size(); }
//
// Builds the noun forms of the current lexeme and adds them to it.
//
// A CHasher initialized with the lexeme's gender and animacy enumerates the
// number/case combinations. For each combination:
//   - singular is skipped if the main symbol is "мн.";
//   - partitive and locative are skipped in the plural, and also when the
//     lexeme has no second genitive / second locative respectively;
//   - the case used for ending lookup is adjusted: accusative goes through
//     eHandleAccEnding(), partitive and locative use the dative ending;
//   - if the lexeme has irregular forms and eCheckIrregularForms() says so,
//     the regular form is skipped;
//   - the stem augment is applied, the stress type determined (locative is
//     always ending-stressed), and the endings selected;
//   - for every ending a word form is created per stress position; several
//     positions go into one form if there is only one or if the lexeme is a
//     multistressed compound, otherwise each position gets its own variant.
//
// Failures of the lookup helpers abort the build with their return code;
// failures of the per-ending helpers only skip that ending.
//
ET_ReturnCode CFormBuilderNouns::eBuild()
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    m_pEndings = new CNounEndings(m_pLexeme);
    if (NULL == m_pEndings)
    {
        return H_ERROR_POINTER;
    }

    ET_ReturnCode rc = H_NO_ERROR;

    ET_Animacy eLexAnimacy = m_pLexeme->eAnimacy();
    ET_Gender eLexGender = m_pLexeme->eGender();

    CHasher gram;
    gram.Initialize(eLexGender, eLexAnimacy);

    do
    {
        //
        // Combinations that do not exist for this lexeme
        //
        if ((L"мн." == m_pLexeme->sMainSymbol()) && (NUM_SG == gram.m_eNumber))
        {
            continue;
        }

        const bool bSecondaryCase = (CASE_PART == gram.m_eCase || CASE_LOC == gram.m_eCase);
        if (bSecondaryCase && NUM_PL == gram.m_eNumber)
        {
            continue;
        }

        if (CASE_PART == gram.m_eCase && !m_pLexeme->bSecondGenitive())
        {
            continue;
        }

        if (CASE_LOC == gram.m_eCase && !m_pLexeme->bSecondLocative())
        {
            continue;
        }

        //
        // Case to be used for ending lookup
        //
        ET_Case eEndingCase = gram.m_eCase;
        if (CASE_ACC == gram.m_eCase)
        {
            rc = eHandleAccEnding (gram.m_eNumber, eEndingCase);
            if (H_NO_ERROR != rc)
            {
                return rc;
            }
        }
        else if (bSecondaryCase)
        {
            eEndingCase = CASE_DAT;
        }

        CEString sLemma (m_pLexeme->sGraphicStem());

        if (m_pLexeme->bHasIrregularForms())
        {
            bool bSkipRegular = false;
            rc = eCheckIrregularForms (gram.m_eGender,
                                       gram.m_eAnimacy,
                                       gram.m_eCase,
                                       eEndingCase,
                                       gram.m_eNumber,
                                       bSkipRegular);
            if (H_NO_ERROR != rc)
            {
                return rc;
            }

            if (bSkipRegular)
            {
                // Workaround for lack of "исх. форма иррег." mark in current source
                if (GENDER_M == gram.m_eGender && NUM_SG == gram.m_eNumber && CASE_NOM == gram.m_eCase)
                {
                    m_bIrregularSourceForm = true;
                }
                continue;
            }
        }

        rc = eHandleStemAugment (sLemma, gram.m_eNumber, gram.m_eCase);
        if (H_NO_ERROR != rc)
        {
            return rc;
        }

        ET_StressLocation eStress = STRESS_LOCATION_UNDEFINED;
        if (CASE_LOC != gram.m_eCase)
        {
            rc = eGetStressType (gram.m_eNumber, eEndingCase, eStress);
            if (H_NO_ERROR != rc)
            {
                return rc;
            }
        }
        else
        {
            eStress = STRESS_LOCATION_ENDING;
        }

        ((CNounEndings *)m_pEndings)->eSelect(gram.m_eNumber, eEndingCase, eStress);
        int iNumEndings = m_pEndings->iCount();
        if (iNumEndings < 1)
        {
            if (m_pLexeme->iType() != 0)
            {
                ASSERT(0);
                ERROR_LOG(L"No endings");
            }
            continue;
        }

        // The lemma can change while a form is built (e.g. because of a
        // fleeting vowel), so it is restored after every ending.
        CEString sSavedLemma (sLemma);
        for (int iEnding = 0; iEnding < iNumEndings; ++iEnding, sLemma = sSavedLemma)
        {
            // Get ending and modify as necessary
            CEString sEnding;
            unsigned __int64 llEndingKey = -1;
            rc = m_pEndings->eGetEnding(iEnding, sEnding, llEndingKey);
            if (H_NO_ERROR != rc)
            {
                return rc;
            }

            if (8 == m_pLexeme->iType() && GENDER_N != m_pLexeme->eGender())
            {
                // After ш, ж, ч, щ, ц endings in я- are dropped, otherwise endings in а-
                const wchar_t * szUnwantedStart = sLemma.bEndsWithOneOf (L"шжчщц") ? L"я" : L"а";
                if (sEnding.bStartsWith (szUnwantedStart))
                {
                    continue;
                }
            }

            rc = eFleetingVowelCheck (gram.m_eNumber,
                                      eEndingCase,
                                      gram.m_eGender,
                                      eStress,
                                      SUBPARADIGM_NOUN,
                                      sEnding,
                                      sLemma);
            if (H_NO_ERROR != rc)
            {
                continue;
            }

            vector<int> vecStress;
            rc = eGetStressPositions (sLemma, sEnding, eStress, vecStress);
            if (H_NO_ERROR != rc)
            {
                continue;
            }

            CWordForm * pWordForm = NULL;
            rc = eCreateFormTemplate (gram.m_eNumber, gram.m_eCase, sLemma, pWordForm);
            if (H_NO_ERROR != rc)
            {
                continue;
            }

            if (1 == vecStress.size() || m_pLexeme->bIsMultistressedCompound())
            {
                // All stress positions belong to one and the same form
                for (vector<int>::size_type uiAt = 0; uiAt < vecStress.size(); ++uiAt)
                {
                    pWordForm->m_mapStress[vecStress[uiAt]] = STRESS_PRIMARY;
                    rc = eHandleYoAlternation (eStress, vecStress[uiAt], pWordForm->m_sLemma, sEnding);
                    if (H_NO_ERROR == rc)
                    {
                        pWordForm->m_sEnding = sEnding;
                        pWordForm->m_llEndingDataId = llEndingKey;
                        pWordForm->m_sWordForm = pWordForm->m_sLemma + sEnding;
                    }
                }
                m_pLexeme->AddWordForm (pWordForm);
            }
            else
            {
                // One stress variant per position; every variant after the
                // first is a clone of the previous one with its stress cleared
                for (vector<int>::size_type uiAt = 0; uiAt < vecStress.size(); ++uiAt)
                {
                    if (0 != uiAt)
                    {
                        CWordForm * pwfVariant = NULL;
                        CloneWordForm (pWordForm, pwfVariant);
                        pwfVariant->m_mapStress.clear();
                        pWordForm = pwfVariant;
                    }

                    pWordForm->m_mapStress[vecStress[uiAt]] = STRESS_PRIMARY;
                    rc = eHandleYoAlternation (eStress, vecStress[uiAt], pWordForm->m_sLemma, sEnding);
                    if (H_NO_ERROR == rc)
                    {
                        pWordForm->m_sWordForm = pWordForm->m_sLemma + sEnding;
                        pWordForm->m_sEnding = sEnding;
                        pWordForm->m_llEndingDataId = llEndingKey;
                        m_pLexeme->AddWordForm (pWordForm);
                    }
                }
            }

        }   //  for (int iEnding = 0; ... )

    } while (gram.bIncrement());

    return H_NO_ERROR;

}       //  eBuildNounForms()
//
// Adjusts sLemma (in place) for lexemes that have a stem augment.
// Nothing is done when the lexeme's stem augment value is below 1.
//
//   type 1:  the last two characters are removed (римлянин, южанин, армянин);
//   type 3:  singular -- unchanged in the nominative and in the accusative
//                        of inanimates, otherwise the next-to-last character
//                        is removed;
//            plural   -- depending on how the graphic stem ends:
//                          -онок   -> -ат        -ёнок   -> -ят
//                          -оночек -> -аток/-атк -ёночек -> -яток/-ятк
//                        (the longer variant is used in the genitive and in
//                        the accusative of animates);
//   type 8:  "ен" is appended, except in the nominative singular and in the
//            accusative singular of inanimates.
//
// Always returns H_NO_ERROR.
//
ET_ReturnCode CFormBuilderNouns::eHandleStemAugment (CEString& sLemma, ET_Number eNumber, ET_Case eCase)
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    if (m_pLexeme->iStemAugment() < 1)
    {
        return H_NO_ERROR;
    }

    switch (m_pLexeme->iType())
    {
        case 1:
        {
            sLemma.sErase (sLemma.uiLength()-2, 2);  // римлянин, южанин, армянин
            break;
        }

        case 3:
        {
            CEString& sGs = m_pLexeme->sGraphicStem();

            if (NUM_SG == eNumber)
            {
                const bool bStemUnchanged =
                    (CASE_NOM == eCase) || (ANIM_NO == m_pLexeme->eAnimacy() && CASE_ACC == eCase);
                if (!bStemUnchanged)
                {
                    sLemma.sErase (sLemma.uiLength()-2, 1);
                }
                return H_NO_ERROR;
            }

            if (NUM_PL != eNumber)
            {
                break;
            }

            if (sGs.bEndsWith (L"онок"))
            {
                sLemma.sErase (sLemma.uiLength()-4, 4);
                sLemma += L"ат";
                return H_NO_ERROR;
            }

            if (sGs.bEndsWith (L"ёнок"))
            {
                sLemma.sErase (sLemma.uiLength()-4, 4);
                sLemma += L"ят";
                return H_NO_ERROR;
            }

            if (sGs.bEndsWith (L"оночек"))
            {
                sLemma.sErase (sLemma.uiLength()-6, 6);
                const bool bLongVariant =
                    (CASE_GEN == eCase) ||
                    (CASE_ACC == eCase && ANIM_YES == m_pLexeme->eAnimacy());   // they all should be animate?
                sLemma += bLongVariant ? L"аток" : L"атк";
                return H_NO_ERROR;
            }

            if (sGs.bEndsWith (L"ёночек"))
            {
                sLemma.sErase (sLemma.uiLength()-6, 6);
                const bool bLongVariant =
                    (CASE_GEN == eCase) ||
                    (CASE_ACC == eCase && ANIM_YES == m_pLexeme->eAnimacy());   // they all should be animate?
                sLemma += bLongVariant ? L"яток" : L"ятк";
                return H_NO_ERROR;
            }

            break;

        }   //  case 3

        case 8:
        {
            const bool bStemUnchanged =
                NUM_SG == eNumber &&
                ((CASE_NOM == eCase) || (ANIM_NO == m_pLexeme->eAnimacy() && CASE_ACC == eCase));
            if (!bStemUnchanged)
            {
                sLemma += L"ен";
            }
            break;
        }

        default:
        {
            break;
        }
    }

    return H_NO_ERROR;

}   //  eHandleStemAugment (...)
//
// Appends to vecStressType the stress location(s) (stem and/or ending) of
// the short form with the given number and gender.
//
// The accent type consulted is m_eAccentType2, or m_eAccentType1 if the
// former is undefined.
//
// Argument contract: plural forms must come with GENDER_UNDEFINED, singular
// forms with a defined gender; anything else is H_ERROR_INVALID_ARG.
//
// For short past passive participles the N Sg m long form is looked up
// first; failure to obtain it is reported as an error. The stress rules
// that should follow from it are not implemented yet (see TODO below).
//
ET_ReturnCode CFormBuilderShortAdj::eGetStressTypes (ET_Number eNumber,
                                                     ET_Gender eGender,
                                                     vector<ET_StressLocation>& vecStressType)
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    ET_ReturnCode rc = H_NO_ERROR;

    const bool bGenderedPlural = (NUM_PL == eNumber && GENDER_UNDEFINED != eGender);
    const bool bGenderlessSingular = (NUM_SG == eNumber && GENDER_UNDEFINED == eGender);
    if (bGenderedPlural || bGenderlessSingular)
    {
        ASSERT(0);
        ERROR_LOG (L"Unexpected gender/number values.");
        return H_ERROR_INVALID_ARG;
    }

    //
    // Exception: part past passive short ending in stressed -Annyj/-jAnnyj;
    // see GDRL p. 86 footnote 4
    //
    if (SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm)
    {
        CGramHasher hasher (POS_VERB,
                            SUBPARADIGM_PART_PAST_PASS_LONG,
                            CASE_NOM,
                            NUM_SG,
                            GENDER_M,
                            PERSON_UNDEFINED,
                            ANIM_NO,
                            m_pLexeme->eAspect(),
                            m_pLexeme->eIsReflexive());

        CWordForm * pNSgMLong = NULL;
        rc = m_pLexeme->eWordFormFromHash (hasher.iGramHash(), 0, pNSgMLong);
        if (H_NO_ERROR != rc)
        {
            return rc;
        }

        if (NULL == pNSgMLong)
        {
            ASSERT(0);
            ERROR_LOG (L"Failed to obtain N Sg m of the long form.");
            return H_ERROR_POINTER;
        }

        CEString sNSgMLong (pNSgMLong->m_sWordForm);
        const bool bInNnyj = sNSgMLong.bEndsWith (L"анный") ||
                             sNSgMLong.bEndsWith (L"янный") ||
                             sNSgMLong.bEndsWith (L"енный");
        if (bInNnyj)
        {
            map<int, ET_StressType>::iterator itStress = pNSgMLong->m_mapStress.begin();
            while (itStress != pNSgMLong->m_mapStress.end())
            {
                if (sNSgMLong.uiNSyllables()-2 == (*itStress).first &&
                    STRESS_PRIMARY == (*itStress).second)
                {
                    // 1. -at'/-jat' [7] -- 1a, p. 83
                    // 2. monosyll verbs -- same as past: zvannyj (113) and dannyj 117
                    // &&&& TODO
                }
                ++itStress;
            }
        }
    }

    ET_AccentType eAt = m_eAccentType2;
    if (AT_UNDEFINED == eAt)
    {
        eAt = m_eAccentType1;
    }

    switch (eAt)
    {
        case AT_UNDEFINED:
        {
            ASSERT(0);
            ERROR_LOG (L"Undefined accent type.");
            return H_ERROR_GENERAL;
        }

        case AT_A:
        {
            vecStressType.push_back (STRESS_LOCATION_STEM);

            // Type sorvana: GDRL, p. 86
            if (SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm &&
                GENDER_F == eGender &&
                AT_C == m_pLexeme->eAccentType2())
            {
                vecStressType.push_back (STRESS_LOCATION_ENDING);
            }
            return H_NO_ERROR;
        }

        case AT_A1:
        {
            if (GENDER_M == eGender || GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }

        case AT_B:
        {
            vecStressType.push_back (STRESS_LOCATION_ENDING);
            return H_NO_ERROR;
        }

        case AT_B1:
        {
            if (GENDER_M == eGender || GENDER_F == eGender || GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }

        case AT_C:
        {
            if (GENDER_M == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender || NUM_PL == eNumber)
            {
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }

        case AT_C1:
        {
            if (GENDER_M == eGender || GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }

        case AT_C2:
        {
            if (GENDER_M == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                return H_NO_ERROR;
            }
            if (GENDER_F == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (GENDER_N == eGender)
            {
                ASSERT (NUM_SG == eNumber);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }
            if (NUM_PL == eNumber)
            {
                ASSERT (GENDER_UNDEFINED == eGender);
                vecStressType.push_back (STRESS_LOCATION_STEM);
                vecStressType.push_back (STRESS_LOCATION_ENDING);
                return H_NO_ERROR;
            }

            ASSERT(0);
            ERROR_LOG (L"Bad arguments.");
            return H_ERROR_INVALID_ARG;
        }

        default:
        {
            ASSERT(0);
            ERROR_LOG (L"Illegal accent type.");
            return H_ERROR_INVALID_ARG;
        }
    }

    return H_ERROR_INVALID_ARG;

}   // eGetStressType()
//
// Applies common deviations to a short form.
//
// A. Adjectival deviations 1 and 2. The deviation is taken from
//    m_mapCommonDeviations if listed there, otherwise from the lexeme.
//    Deviation 1 concerns the masculine form only: for any other gender
//    the function returns H_NO_MORE. If the deviation is optional and this
//    is a short adjective, a clone of the form is added to the lexeme and
//    the clone is the one that gets modified, so that both variants exist.
//    The modification itself: masculine -- the last character is dropped
//    (and one more if there is a fleeting vowel); other genders -- the
//    next-to-last character is dropped.
//
// B. Verbal deviation 7, short past passive participles only. If the
//    deviation is optional the form is cloned as above. Every primary
//    stress is moved to the last syllable of the word form; entries whose
//    stress type is zero are kept in place as secondary. The corrected
//    stress map then replaces the form's stress map.
//
// Returns H_NO_ERROR on success, H_NO_MORE as described above,
// H_ERROR_UNEXPECTED for a primary stress position below 1, and
// H_EXCEPTION if a CException is caught.
//
ET_ReturnCode CFormBuilderShortAdj::eHandleDeviations (CWordForm * pWordForm)
{
    ASSERT(m_pLexeme);   // we assume base class ctor took care of this

    try
    {
        //
        // A. Common deviations defined for adjectives (1-2)
        //
        bool bOptionalCD = false;
        int iCd = 1;
        for (; iCd <= 2; ++iCd)
        {
            map<int, bool>::iterator itCd = m_mapCommonDeviations.find (iCd);
            if (m_mapCommonDeviations.end() != itCd)
            {
                bOptionalCD = (*itCd).second;
                break;
            }
            if (m_pLexeme->bFindCommonDeviation (iCd, bOptionalCD))
            {
                break;
            }
        }

        //
        // Only adjectives in -nnyj/-nnij or participia
        // (iCd is 3 here if neither deviation was found)
        //
        if (1 == iCd || 2 == iCd)
        {
            if (1 == iCd && GENDER_M != pWordForm->m_eGender)
            {
                return H_NO_MORE;
            }

            if (bOptionalCD && SUBPARADIGM_SHORT_ADJ == m_eSubparadigm)   // store both forms
            {
                CWordForm * pMVariant = NULL;
                CloneWordForm (pWordForm, pMVariant);
                m_pLexeme->AddWordForm (pMVariant);
                pWordForm = pMVariant;
            }

            if (GENDER_M == pWordForm->m_eGender)
            {
                pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-1);
                if (m_bFleetingVowel)
                {
                    pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-1);
                }
            }
            else
            {
                pWordForm->m_sWordForm.sErase (pWordForm->m_sWordForm.uiLength()-2, 1);
            }

        }   //  if (1 == iCd || 2 == iCd)

        //
        // B. Common deviations defined for verbs (7-8)
        //
        // TODO -- deviation 8 is not handled yet: only 7 triggers the
        // stress correction below.
        //
        iCd = m_pLexeme->bHasCommonDeviation(7) ? 7 : -1;

        if (iCd > 0 && SUBPARADIGM_PART_PAST_PASS_SHORT == m_eSubparadigm)
        {
            if (m_pLexeme->bDeviationOptional(iCd))   // store both forms
            {
                CWordForm * pMVariant = NULL;
                CloneWordForm (pWordForm, pMVariant);
                m_pLexeme->AddWordForm (pMVariant);
                pWordForm = pMVariant;
            }

            map<int, ET_StressType> mapCorrectedStress;
            map<int, ET_StressType>::iterator itStressPos = pWordForm->m_mapStress.begin();
            for (; itStressPos != pWordForm->m_mapStress.end(); ++itStressPos)
            {
                if (!(*itStressPos).second)
                {
                    // Stress type value 0: carried over as secondary
                    mapCorrectedStress[(*itStressPos).first] = STRESS_SECONDARY;
                    continue;
                }
                if ((*itStressPos).first < 1)
                {
                    ASSERT(0);
                    ERROR_LOG (L"Unexpected stress position in cd-7 or cd-8 participle.");
                    return H_ERROR_UNEXPECTED;
                }

                CEString sWf (pWordForm->m_sWordForm);
                mapCorrectedStress[sWf.uiNSyllables()-1] = STRESS_PRIMARY;
            }

            // Store the result: previously the corrected map was built and
            // then discarded, so the deviation had no effect on the form.
            pWordForm->m_mapStress = mapCorrectedStress;
        }
    }
    catch (CException& ex)
    {
        CEString sMsg (L"Exception: ");
        sMsg += ex.szGetDescription();
        ERROR_LOG (sMsg);
        return H_EXCEPTION;
    }

    return H_NO_ERROR;

}   //  eHandleDeviations (...)
int _tmain(int argc, _TCHAR* argv[]) { wstring sReplaceableB (L"0123456789012345"); wstring sReplacedB = sReplaceableB.replace (5, 6, L"abcd"); CEString sReplacableC(L"0123456789012345678901234567890123567890"); CEString sReplacedCC = sReplacableC.sReplace(L"567890", L"abcd"); CEString sReplacedC = sReplacableC.sReplace(5, 6, L"abcd"); sReplaceableB = L"0123456789"; sReplacedB = sReplaceableB.replace (5, 3, L"a"); sReplaceableB = L"0123456789"; sReplacedB = sReplaceableB.replace (8, 2, L"ab"); sReplaceableB = L"0123456789"; sReplacedB = sReplaceableB.replace (5, 3, L"a"); sReplaceableB = L"0123456789"; sReplacedB = sReplaceableB.replace (8, 2, L"abc"); wstring sEraseableB (L"0123456789"); wstring sErasedB = sEraseableB.erase (5, 3); sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase (5, 5); sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase (5, 7); sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase (5); sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase(); try { sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase (12, 7); } catch (...) { } try { sEraseableB = L"0123456789"; sErasedB = sEraseableB.erase (12); } catch (...) 
{ } // Ctors CEString sEmptyString; if (0 != sEmptyString.uiLength() || 0 != sEmptyString.uiGetNumOfTokens() || 0 != sEmptyString.uiGetNumOfFields() || 0 != sEmptyString.uiGetVisibleLength()) { ERROR_LOG (L"Initialization error"); } CEString sCopy (sEmptyString); if (0 != sCopy.uiLength() || 0 != sCopy.uiGetNumOfTokens() || 0 != sCopy.uiGetNumOfFields() || 0 != sCopy.uiGetVisibleLength()) { ERROR_LOG (L"Initialization error"); } sCopy = L"0123456789"; CEString sCopy2 (sCopy); sCopy2.SetBreakChars (L" -/"); CEString sCopy3 (sCopy2); CEString sFromCString (L"0123456789"); if (10 != sFromCString.uiLength() || 1 != sFromCString.uiGetNumOfTokens() || 1 != sFromCString.uiGetNumOfFields() || 10 != sFromCString.uiGetVisibleLength()) { ERROR_LOG (L"Initialization error"); } // TODO: operator () CEString sSquareBracketsTest (L"0123456789"); sSquareBracketsTest[1] = L'a'; if (sSquareBracketsTest != L"0a23456789") { ERROR_LOG (L"Square brackets operator error"); } sSquareBracketsTest = L"0123456789"; sSquareBracketsTest[0] = L'a'; if (sSquareBracketsTest != L"a123456789") { ERROR_LOG (L"Square brackets operator error"); } sSquareBracketsTest = L"0123456789"; sSquareBracketsTest[9] = L'a'; if (sSquareBracketsTest != L"012345678a") { ERROR_LOG (L"Square brackets operator error"); } sSquareBracketsTest = L"0123456789"; CEString sLetter = sSquareBracketsTest[1]; if (L"1" != sLetter) { ERROR_LOG (L"Square brackets operator error"); } // Comparison ERelation eRet = CEString::eCompare (L"1234567", L"1234567"); if (ecEqual != eRet) { ERROR_LOG (L"Comparison error"); } eRet = CEString::eCompare (L"1234567", L"1234566"); if (eRet != ecGreater) { ERROR_LOG (L"Comparison error"); } eRet = CEString::eCompare (L"1234566", L"1234567"); if (eRet != ecLess) { ERROR_LOG (L"Comparison error"); } eRet = CEString::eCompare (L"123456", L"1234567"); if (eRet != ecLess) { ERROR_LOG (L"Comparison error"); } eRet = CEString::eCompare (L"1234567", L"123456"); if (eRet != ecGreater) { ERROR_LOG 
(L"Comparison error"); } eRet = CEString::eCompareNoCase (L"AbCdEfG", L"ABCDEFg"); if (ecEqual != eRet) { ERROR_LOG (L"Comparison error"); } eRet = CEString::eCompareNoCase (L"АбВгДЕ", L"АБВГДе"); if (ecEqual != eRet) { ERROR_LOG (L"Comparison error"); } bool bRet = CEString::bIn (L'2', L"0123456789"); if (!bRet) { ERROR_LOG (L"bIn() failed."); } bRet = CEString::bIn (L'a', L"0123456789"); if (bRet) { ERROR_LOG (L"bIn() failed."); } CEString sSearcheable (L"0123456789"); unsigned int uiFindRet = sSearcheable.uiFind (L"123"); if (1 != uiFindRet) { ERROR_LOG (L"uiFind() failed."); } uiFindRet = sSearcheable.uiFind (L"abc"); if (ecNotFound != uiFindRet) { ERROR_LOG (L"uiFind() failed."); } sSearcheable = L"aBcDeFgHiJ"; uiFindRet = sSearcheable.uiFindNoCase (L"bCDEF"); if (ecNotFound == uiFindRet) { ERROR_LOG (L"uiFindNoCase() failed."); } sSearcheable = L"012345543210"; uiFindRet = sSearcheable.uiRFind (L"5"); if (6 != uiFindRet) { ERROR_LOG (L"uiRFind() failed."); } // unsigned int uiRFindNoCase (const wchar_t * szRhs) const sSearcheable = L"0123456789"; uiFindRet = sSearcheable.uiFindFirstOf (L"234"); if (2 != uiFindRet) { ERROR_LOG (L"uiFindFirstOf() failed."); } // unsigned int uiFindFirstOfNoCase (const wchar_t * szSet) const sSearcheable = L"0120120123456789"; uiFindRet = sSearcheable.uiFindOneOf (3, L"234"); if (5 != uiFindRet) { ERROR_LOG (L"uiFindOneOf() failed."); } sSearcheable = L"0123456789"; uiFindRet = sSearcheable.uiFindLastOf (L"234"); if (4 != uiFindRet) { ERROR_LOG (L"uiFindLastOf() failed."); } // unsigned int uiFindLastOfNoCase (const wchar_t * szSet) const bRet = sSearcheable.bStartsWith (L"012"); if (!bRet) { ERROR_LOG (L"bStartsWith() failed."); } bRet = sSearcheable.bStartsWith (L"234"); if (bRet) { ERROR_LOG (L"bStartsWith() failed."); } sSearcheable = L"aBcDeFgHiJ"; bRet = sSearcheable.bStartsWithNoCase (L"abcd"); if (!bRet) { ERROR_LOG (L"bStartsWithNoCase() failed."); } sSearcheable = L"0123456789"; bRet = sSearcheable.bStartsWithOneOf 
(L"012"); if (!bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } bRet = sSearcheable.bStartsWithOneOf (L"123"); if (bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } sSearcheable = L"aBcDeFgHiJ"; bRet = sSearcheable.bStartsWithOneOfNoCase (L"abc"); if (!bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } bRet = sSearcheable.bStartsWithOneOfNoCase (L"bc"); if (bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } sSearcheable = L"аБвГдЕёжзи"; bRet = sSearcheable.bStartsWithOneOfNoCase (L"абв"); if (!bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } bRet = sSearcheable.bStartsWithOneOfNoCase (L"бв"); if (bRet) { ERROR_LOG (L"bStartsWithOneOf() failed."); } sSearcheable = L"0123456789"; bRet = sSearcheable.bEndsWith (L"789"); if (!bRet) { ERROR_LOG (L"bStartsWith() failed."); } bRet = sSearcheable.bEndsWith (L"123"); if (bRet) { ERROR_LOG (L"bStartsWith() failed."); } sSearcheable = L"abcdeFgHiJ"; bRet = sSearcheable.bEndsWithNoCase (L"hij"); if (!bRet) { ERROR_LOG (L"bEndsWithNoCase() failed."); } bRet = sSearcheable.bEndsWithNoCase (L"ghi"); if (bRet) { ERROR_LOG (L"bEndsWithNoCase() failed."); } sSearcheable = L"абвгдЕёЖзИ"; bRet = sSearcheable.bEndsWithNoCase (L"жзи"); if (!bRet) { ERROR_LOG (L"bEndsWithNoCase() failed."); } bRet = sSearcheable.bEndsWithNoCase (L"ёжз"); if (bRet) { ERROR_LOG (L"bEndsWithNoCase() failed."); } sSearcheable = L"0123456789"; bRet = sSearcheable.bEndsWithOneOf (L"ab9"); if (!bRet) { ERROR_LOG (L"bEndsWithOneOf() failed."); } bRet = sSearcheable.bEndsWithOneOf (L"ab8"); if (bRet) { ERROR_LOG (L"bEndsWithOneOf() failed."); } sSearcheable = L"aBcDeFgHiJ"; bRet = sSearcheable.bEndsWithOneOfNoCase (L"abj"); if (!bRet) { ERROR_LOG (L"bEndsWithOneOfNoCase failed."); } bRet = sSearcheable.bEndsWithOneOfNoCase (L"abc"); if (bRet) { ERROR_LOG (L"bEndsWithOneOfNoCase failed."); } sSearcheable = L"абвгдЕёЖзИ"; bRet = sSearcheable.bEndsWithOneOfNoCase (L"abи"); if (!bRet) { ERROR_LOG (L"bEndsWithOneOfNoCase failed."); } bRet = 
sSearcheable.bEndsWithOneOfNoCase (L"abc"); if (bRet) { ERROR_LOG (L"bEndsWithOneOfNoCase failed."); } // Operators CEString sLhs (L"01234"); CEString sRhs (L"56789"); bRet = (sLhs == sRhs); if (bRet) { ERROR_LOG (L"Comparison error"); } bRet = (sLhs == L"01234"); if (!bRet) { ERROR_LOG (L"Comparison error"); } bRet = (L"01234" == sLhs); if (!bRet) { ERROR_LOG (L"Comparison error"); } // CString csLhs (L"01234"); // CString csRhs (L"56789"); // bRet = (L"01234" == csLhs); // if (!bRet) // { // ERROR_LOG (L"CString behavior does not match CEString behavior"); // } bRet = (sLhs < sRhs); if (!bRet) { ERROR_LOG (L"Comparison error"); } bRet = (sLhs > sRhs); if (bRet) { ERROR_LOG (L"Comparison error"); } bRet = (sLhs <= sRhs); if (!bRet) { ERROR_LOG (L"Comparison error"); } bRet = (sLhs >= sRhs); if (bRet) { ERROR_LOG (L"Comparison error"); } bRet = (sLhs >= sLhs); if (!bRet) { ERROR_LOG (L"Comparison error"); } sEmptyString = L"0123456"; if (sEmptyString != L"0123456") { ERROR_LOG (L"Assignemnt or comparison error"); } sEmptyString = sRhs; if (sEmptyString != sRhs) { ERROR_LOG (L"Assignemnt or comparison error"); } //CEString sResult = sLhs + sRhs; //if (sResult != L"0123456789") //{ // ERROR_LOG (L"Concatenation or comparison error"); //} //sResult += L"<--Concatenated"; //if (sResult != L"0123456789<--Concatenated") //{ // ERROR_LOG (L"Concatenation or comparison error"); //} CEString sInsertable (L"0123789"); CEString sInserted = sInsertable.sInsert (4, L"456"); if (sInserted != sInsertable || sInsertable != L"0123456789") { ERROR_LOG (L"Insertion error"); } sInsertable = L"012456789"; sInserted = sInsertable.sInsert (3, L'3'); if (sInserted != sInsertable || sInsertable != L"0123456789") { ERROR_LOG (L"Insertion error"); } CEString sErasable (L"012abcd3456789"); CEString sErased = sErasable.sErase (3, 4); if (sErased != sErasable || sErasable != L"0123456789") { ERROR_LOG (L"Erase error"); } sErasable = L"0123456789"; sErased = sErasable.sErase (3, 7); if (sErased 
!= sErasable || sErasable != L"012") { ERROR_LOG (L"Erase error"); } sErasable = L"0123456789"; sErased = sErasable.sErase (3, 40); if (sErased != sErasable || sErasable != L"012") { ERROR_LOG (L"Erase error"); } sErasable = L"0123456789"; sErased = sErasable.sErase (3); if (sErased != sErasable || sErasable != L"012") { ERROR_LOG (L"Erase error"); } sErasable = L"0123456789a"; sErased = sErasable.sErase (10); if (sErased != sErasable || sErasable != L"0123456789") { ERROR_LOG (L"Erase error"); } sErasable.Erase(); if (!sErasable.bIsEmpty() || sErasable.uiLength() != 0) { ERROR_LOG (L"Erase error"); } sErasable = L"0123456789"; CEString sConvertToUppercase(L"aAbBcC"); sConvertToUppercase.ToUpper(); if (sConvertToUppercase != L"AABBCC") { ERROR_LOG(L"ToUpper error"); } sConvertToUppercase = CEString::sToUpper(L"aAbBcC"); if (sConvertToUppercase != L"AABBCC") { ERROR_LOG(L"ToUpper error"); } CEString sConvertToUppercaseCyr(L"aABbcCаАбБвВ"); sConvertToUppercaseCyr.ToUpper(); if (sConvertToUppercaseCyr != L"AABBCCААББВВ") { ERROR_LOG(L"ToUpper error for Cyrillic"); } sConvertToUppercaseCyr = CEString::sToUpper(L"aAbBcCаАбБвВ"); if (sConvertToUppercaseCyr != L"AABBCCААББВВ") { ERROR_LOG(L"sToUpper error for Cyrillic"); } CEString sConvertToLowercase(L"aABbcC"); sConvertToLowercase.ToLower(); if (sConvertToLowercase != L"aabbcc") { ERROR_LOG(L"ToLower error"); } sConvertToLowercase = CEString::sToLower(L"aAbBcC"); if (sConvertToLowercase != L"aabbcc") { ERROR_LOG(L"ToLower error"); } CEString sConvertToLowercaseCyr(L"aABbcCаАбБвВ"); sConvertToLowercaseCyr.ToLower(); if (sConvertToLowercaseCyr != L"aabbccааббвв") { ERROR_LOG(L"ToLower error for Cyrillic"); } sConvertToLowercaseCyr = CEString::sToLower(L"aAbBcCаАбБвВ"); if (sConvertToLowercaseCyr != L"aabbccааббвв") { ERROR_LOG(L"sToLower error for Cyrillic"); } CEString sFromAscii = CEString::sToString("abcdefgxyzABCDEFGXYZ01234567890.,!"); if (sFromAscii != L"abcdefgxyzABCDEFGXYZ01234567890.,!") { ERROR_LOG(L"sToString 
error for ascii conversion"); } CEString sReplaceable(L"01abcd6789"); CEString sReplaced = sReplaceable.sReplace (2, L"2345"); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"0123456abc"; sReplaced = sReplaceable.sReplace (7, L"789"); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"0123456a89"; sReplaced = sReplaceable.sReplace (7, L'7'); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"012345678a"; sReplaced = sReplaceable.sReplace (9, L'9'); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"01234abc89"; sReplaced = sReplaceable.sReplace (5, 3, L"567"); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"01234aaa6789"; sReplaced = sReplaceable.sReplace (5, 3, L"5"); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"01234567ab"; sErased = sReplaceable.sReplace (8, 2, L"89"); if (sReplaced != sReplaceable || sReplaceable != L"0123456789") { ERROR_LOG (L"Replace error"); } sReplaceable = L"01234567a9"; sReplaced = sReplaceable.sReplace (8, 2, L"8"); if (sReplaced != sReplaceable || sReplaceable != L"012345678") { ERROR_LOG (L"Replace error"); } sReplaceable = L"01234567ab"; sReplaced = sReplaceable.sReplace (8, 2, L"890"); if (sReplaced != sReplaceable || sReplaceable != L"01234567890") { ERROR_LOG (L"Replace error"); } sReplaceable = L"0ё2345ё78ёё"; sReplaceable.Replace (0, 10, L'ё', L'е'); if (sReplaceable != L"0е2345е78ее") { ERROR_LOG (L"Replace error"); } CEString sTrimmable (L" 01234 "); sTrimmable.TrimLeft(); if (sTrimmable != L"01234 ") { ERROR_LOG (L"Trim or comparison error"); } sTrimmable.TrimRight(); if (sTrimmable != L"01234") { ERROR_LOG (L"Trim or comparison error"); 
} sTrimmable = L" 01234 "; sTrimmable.Trim(); if (sTrimmable != L"01234") { ERROR_LOG (L"Trim or comparison error"); } sTrimmable = L"=&=&=01234&&&=="; sTrimmable.TrimLeft (L"=&"); if (sTrimmable != L"01234&&&==") { ERROR_LOG (L"Trim or comparison error"); } sTrimmable.Trim (L"=&"); if (sTrimmable != L"01234") { ERROR_LOG (L"Trim or comparison error"); } sTrimmable = L"=&=&=01234&&&=="; sTrimmable.Trim (L"=&"); if (sTrimmable != L"01234") { ERROR_LOG (L"Trim or comparison error"); } CEString sReversable (L"0123456789"); sReversable.Reverse(); if (sReversable != L"9876543210") { ERROR_LOG (L"Reversing error"); } CEString sWhole (L"0123456789"); CEString sSubstr = sWhole.sSubstr (1, 3); if (sSubstr != L"123") { ERROR_LOG (L"Trim or comparison error"); } sSubstr = sWhole.sSubstr (7); if (sSubstr != L"789") { ERROR_LOG (L"Trim or comparison error"); } CEString sFields (L"123 456 789"); sFields.SetBreakChars (L" "); CEString sField = sFields.sGetField (1); if (sField != L"456") { ERROR_LOG (L"Tokenizer or comparison error"); } StToken stToken = sFields.stGetField (0); if (3 != stToken.uiLength || 0 != stToken.uiOffset || ecTokenText != stToken.eType) { ERROR_LOG (L"Tokenizer or comparison error"); } stToken = sFields.stGetField (0, ecTokenSpace); if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType) { ERROR_LOG (L"Tokenizer or comparison error"); } try { // stToken = sFields.stGetField (99); // ERROR_LOG (L"Tokenizer or comparison error"); // Exception expected } catch (CException& ex) { // ::MessageBox (NULL, ex.sGetDescription().c_str(), L"Kai Exception", MB_ICONWARNING); } // ST_Token st_GetFieldFromOffset (int i_offset, // et_TokenType eo_type = ec_TokenText); stToken = sFields.stGetTokenFromOffset (6); if (3 != stToken.uiLength || 4 != stToken.uiOffset || ecTokenText != stToken.eType) { ERROR_LOG (L"Tokenizer or comparison error"); } ETokenType eType = sFields.eGetTokenType (1); // et_TokenType eo_GetTokenType (int i_offset, int 
i_at); if (ecTokenBreakChars != eType) { ERROR_LOG (L"Tokenizer or comparison error"); } stToken = sFields.stGetToken (1); if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType) { ERROR_LOG (L"Tokenizer or comparison error"); } const StToken& rstToken = sFields.rstGetToken (1); if (1 != stToken.uiLength || 3 != stToken.uiOffset || ecTokenSpace != stToken.eType) { ERROR_LOG (L"Tokenizer or comparison error"); } CEString sToken = sFields.sGetToken (1); if (sToken != L" ") { ERROR_LOG (L"Tokenizer or comparison error"); } try { CEString sToken1 = sFields.sGetToken(999); if (sToken1 != L" ") { ERROR_LOG(L"Tokenizer or comparison error"); } } catch (CException ex) { CEString sMsg(L"Exception: "); sMsg += ex.szGetDescription(); ERROR_LOG(sMsg); } bool b_ = sFields.bGetNextToken(stToken); if (!b_ || ecTokenText != stToken.eType || 4 != stToken.uiOffset || 3 != stToken.uiLength) { ERROR_LOG (L"Tokenizer or comparison error"); } b_ = sFields.bGetPrevToken (stToken); if (!b_ || ecTokenBreakChars != stToken.eType || 3 != stToken.uiOffset || 1 != stToken.uiLength) { ERROR_LOG (L"Tokenizer or comparison error"); } unsigned int uiTokenNum = sFields.uiGetTokenNum (stToken); if (1 != uiTokenNum) { ERROR_LOG (L"Tokenizer or comparison error"); } unsigned int uiFields = sFields.uiGetNumOfFields(); if (3 != uiFields) { ERROR_LOG (L"Tokenizer or comparison error"); } uiFields = sFields.uiGetNumOfFields (ecTokenSpace); if (2 != uiFields) { ERROR_LOG (L"Tokenizer or comparison error"); } // uiFields = sFields.uiGetNumOfFields (3, 6); uiFields = sFields.uiNFields(); if (3 != uiFields) { ERROR_LOG (L"Tokenizer or comparison error"); } // uiFields = sFields.uiNFields (3, 6); unsigned int uiTokens = sFields.uiGetNumOfTokens(); if (5 != uiTokens) { ERROR_LOG (L"Tokenizer or comparison error"); } uiTokens = sFields.uiNTokens(); if (5 != uiTokens) { ERROR_LOG (L"Tokenizer or comparison error"); } unsigned int uiVLength = sFields.uiGetVisibleLength(); if (11 != 
uiVLength) { ERROR_LOG (L"Tokenizer or comparison error"); } unsigned int uiFLength = sFields.uiGetFieldLength (1); if (3 != uiFLength) { ERROR_LOG (L"Tokenizer or comparison error"); } // CEString s (L"abcdefg"); // wchar_t * szData = (wchar_t*)s; CEString sSyllables (L"бавогузюы"); sSyllables.SetVowels (L"аеёиоуыэюя"); unsigned int uiSyllables = sSyllables.uiGetNumOfSyllables(); if (5 != uiSyllables) { ERROR_LOG (L"Syllable count error"); } uiSyllables = sSyllables.uiNSyllables(); if (5 != uiSyllables) { ERROR_LOG (L"Syllable count error"); } // Vowels & consonants unsigned int uiVowelPos = sSyllables.uiGetVowelPos (3); if (7 != uiVowelPos) { ERROR_LOG (L"Vowel position error"); } unsigned int uiSyllPos = sSyllables.uiGetSyllableFromVowelPos (7); if (3 != uiSyllPos) { ERROR_LOG (L"Syllable position error"); } { CEString sConvert = CEString::sToString (9999999999999); if (L"9999999999999" != sConvert) { ERROR_LOG(L"Large int conversion error"); } int i_ = 999999; sConvert = CEString::sToString (i_); if (L"999999" != sConvert) { ERROR_LOG(L"Int conversion error"); } } { CEString sConvert = CEString::sToString(999999999.9999); double d_ = 999999.999; sConvert = CEString::sToString(d_); } // // Done! // CLogger::pGetInstance()->Flush(); _CrtDumpMemoryLeaks(); }
int CT_LexPreprocessor::iClassifyStems() // For every endings subtable, looks for the stems usable with it and // stores up to NUM_SFX their longest common suffixes in the database { if (m_pDb == NULL) { return -1; } const int MIN_NUMBER_OF_STEMS = 70; const int NUM_SFX = 5; const int MAX_NUM_SFX = 24; CEString sQuery, sStem; CEString *arr_sStems; CEString **parr_sSfx; vector<CEString> vecStems; int iLastSubtable = 0, iStem; iLastSubtable = m_pDb->iLastID(L"endings_meta"); for (int iSubtable = 0; iSubtable <= iLastSubtable; ++iSubtable) { vecStems.clear(); CEString sFirstLemma = L""; int iCutRight = 0; CEString sLemmaEnding = L""; vLongStemsBySubtable(iSubtable, 2, vecStems, sFirstLemma); if (vecStems.size() < MIN_NUMBER_OF_STEMS) { continue; } // Find the longest common prefix of the first stem and the corresponding lemma CEString* arr_sStemAndLemma; CEString** parr_sPfx; arr_sStemAndLemma = new CEString[2]; arr_sStemAndLemma[0] = vecStems[0]; arr_sStemAndLemma[1] = sFirstLemma; parr_sPfx = new CEString*; *parr_sPfx = new CEString[1]; int iPfx = iLCP(arr_sStemAndLemma, parr_sPfx, 2, 1); if (iPfx <= 0) { continue; } CEString sCommonPfx = (*parr_sPfx)[0]; iCutRight = vecStems[0].uiLength() - sCommonPfx.uiLength(); if (iCutRight >= 4) { continue; } sLemmaEnding = sFirstLemma.sSubstr(sCommonPfx.uiLength(), sFirstLemma.uiLength() - sCommonPfx.uiLength()); // Find longest common suffixes of the stems found iStem = 0; arr_sStems = new CEString[vecStems.size()]; parr_sSfx = new CEString*; *parr_sSfx = new CEString[1]; for (vector<CEString>::iterator iterStems = vecStems.begin(); iterStems != vecStems.end(); ++iterStems, ++iStem) { // We reverse the stem so that i_LCP could find suffixes // instead of prefixes // reverse((*iter_stems).begin(), (*iter_stems).end()); (*iterStems).Reverse(); arr_sStems[iStem] = *iterStems; } // several attemps int iSfx = 0; int iMaxSfx = NUM_SFX; while (iSfx <= 0 && iMaxSfx <= MAX_NUM_SFX) { delete[] *parr_sSfx; delete parr_sSfx; parr_sSfx = 
new CEString*; *parr_sSfx = new CEString[1]; iSfx = iLCP(arr_sStems, parr_sSfx, vecStems.size(), iMaxSfx); if (iSfx == 1 && (*parr_sSfx)[0].uiLength() <= 0) { iSfx = 0; } iMaxSfx += 2; } vInsertCommonSfx(parr_sSfx, iSfx, iSubtable, vecStems.size(), iCutRight, sLemmaEnding); delete[] arr_sStems; delete[] *parr_sSfx; delete parr_sSfx; // TEST //if (i_subtable > 100) //{ // break; //} } return 0; }
//
// Loads the irregular forms of this lexeme from the database.
//
// Clears m_mmapIrregularForms, then reads every row of irregular_forms whose
// descriptor_id equals m_stProperties.iDbKey. For each row a CWordForm is
// created (marked irregular, with the stresses found in irregular_stress for
// that form id) and inserted into m_mmapIrregularForms under its gram hash.
// m_stProperties.bHasIrregularVariants is set if any row is flagged
// is_alternative.
//
// Returns:
//   H_FALSE      the lexeme has no irregular forms (nothing is touched)
//   H_EXCEPTION  an exception was caught; forms loaded so far stay in the map
//   H_NO_ERROR   otherwise
//
ET_ReturnCode CLexeme::eLoadIrregularForms()
{
    if (!m_stProperties.bHasIrregularForms)
    {
        return H_FALSE;
    }

    ET_ReturnCode rc = H_NO_ERROR;

    m_stProperties.bHasIrregularVariants = false;

    CEString sQuery (L"SELECT id, gram_hash, wordform, is_alternative FROM irregular_forms WHERE descriptor_id = ");
    sQuery += CEString::sToString (m_stProperties.iDbKey);
    sQuery += L";";

    CSqlite * pDb = NULL;

    m_mmapIrregularForms.clear();

    try
    {
        pDb = m_pDictionary->pGetDbHandle();

        unsigned int uiQueryHandle = pDb->uiPrepareForSelect (sQuery);
        while (pDb->bGetRow(uiQueryHandle))
        {
            int iId = -1;
            int iHash = -1;
            CEString sForm;
            bool bIsVariant = false;
            pDb->GetData (0, iId, uiQueryHandle);
            pDb->GetData (1, iHash, uiQueryHandle);
            pDb->GetData (2, sForm, uiQueryHandle);
            pDb->GetData (3, bIsVariant, uiQueryHandle);

            if (bIsVariant)
            {
                m_stProperties.bHasIrregularVariants = true;
            }

            CEString sStressQuery (L"SELECT position, is_primary FROM irregular_stress WHERE form_id = ");
            sStressQuery += CEString::sToString (iId);
            sStressQuery += L";";

            // Read the stresses before allocating the word form, so that a
            // failing stress query cannot leak a half-built CWordForm.
            map<int, ET_StressType> mapStress;
            unsigned int uiStressHandle = pDb->uiPrepareForSelect (sStressQuery);
            while (pDb->bGetRow (uiStressHandle))
            {
                int iPos = -1;
                bool bPrimary = false;
                pDb->GetData (0, iPos, uiStressHandle);
                pDb->GetData (1, bPrimary, uiStressHandle);
                int iStressedSyll = sForm.uiGetSyllableFromVowelPos (iPos);
                mapStress[iStressedSyll] = bPrimary ? STRESS_PRIMARY : STRESS_SECONDARY;
            }
            pDb->Finalize (uiStressHandle);

            CWordForm * pWf = new CWordForm(iHash);
            pWf->m_pLexeme = this;
            pWf->m_bIrregular = true;
            pWf->m_sWordForm = sForm;
            for (map<int, ET_StressType>::iterator itStress = mapStress.begin();
                 itStress != mapStress.end();
                 ++itStress)
            {
                pWf->m_mapStress[(*itStress).first] = (*itStress).second;
            }

            StIrregularForm stIf(pWf, bIsVariant);
            pair<int, StIrregularForm> pairHashToWordForm (iHash, stIf);
            m_mmapIrregularForms.insert (pairHashToWordForm);

        }   //  while (pDb->bGetRow())

        pDb->Finalize(uiQueryHandle);
    }
    catch (CException& ex)
    {
        ERROR_LOG (ex.szGetDescription());
        rc = H_EXCEPTION;
    }
    catch (...)
    {
        CEString sMsg (L"Apparent DB error ");

        // pDb is still NULL if obtaining the handle itself threw
        if (pDb != NULL)
        {
            try
            {
                CEString sError;
                pDb->GetLastError (sError);
                sMsg += CEString::sToString(pDb->iGetLastError());
                sMsg += L": ";
                sMsg += sError;
            }
            catch (...)
            {
                // Could not even query the error; log the bare message
                sMsg = L"Apparent DB error ";
            }
        }

        ERROR_LOG (sMsg);
        rc = H_EXCEPTION;
    }

    return rc;

}   //  eLoadIrregularForms()