// Length of the longest "piece" of @w anchored at position 0: the longest
// common prefix of @w with any of its nontrivial cyclic shifts, or with any
// cyclic shift of its inverse. Returns early once the piece reaches |w|/6
// (callers only compare the result against that threshold).
// NOTE(review): the shift loops stop when the shifted copy equals the start
// word again, so periodic words terminate after one period — confirm this is
// intended rather than scanning all |w| shifts.
CWord::size_type LongestPieceFixedStart(const CWord& w) {
    CWord::size_type result = 1;
    auto shifted = w;
    shifted.CyclicLeftShift();
    // Compare @w against each of its nontrivial cyclic shifts.
    while (shifted != w) {
        result = std::max(result, CommonPrefixLength(w, shifted));
        if (result * 6 >= w.size()) {
            return result;
        }
        shifted.CyclicLeftShift();
    }
    auto inverse = w.Inverse();
    // If the inverse is a cyclic shift of @w itself (same least cyclic
    // shift), its shifts were already covered by the loop above.
    if (LeastCyclicShift(inverse) == LeastCyclicShift(w)) {
        return result;
    }
    result = std::max(result, CommonPrefixLength(w, inverse));
    inverse.CyclicLeftShift();
    // Compare @w against each cyclic shift of its inverse.
    while (inverse != w.Inverse()) {
        result = std::max(result, CommonPrefixLength(w, inverse));
        if (result * 6 >= w.size()) {
            return result;
        }
        inverse.CyclicLeftShift();
    }
    return result;
}
// Turns the word sequence @ws into a quoted multiword attached to @article.
// Either reuses the homonym of a single unknown-POS source word, or creates
// a fresh lowercase homonym covering the whole sequence; then registers the
// article (aux-dic or gazetteer) and stores @ws in m_wordSequences.
void CMultiWordCreator::AddQuoteMultiWord(CWordSequence& ws, const TArticleRef& article) {
    SWordHomonymNum wh;
    Wtroka str;
    CWord* pNewWord = GetWordForMultiWord(ws, str, wh);
    if (pNewWord->m_SourceWords.Size() == 1 && pNewWord->HasOnlyUnknownPOS()) {
        // A single unknown word: reuse its first homonym instead of cloning.
        size_t firstId = pNewWord->IterHomonyms().GetID();
        CHomonym& h = pNewWord->GetRusHomonym(firstId);
        h.SetSourceWordSequence(&ws);
        h.PutArticle(article);
        wh.m_HomNum = firstId;
    } else {
        // Multi-token sequence: create a new lowercased homonym spanning it.
        pNewWord->m_SourceWords.SetPair(ws.FirstWord(), ws.LastWord());
        if (str.size() == 0)
            str = pNewWord->m_txt;
        TMorph::ToLower(str);
        CHomonym* pNewHom = new CHomonym(TMorph::GetMainLanguage(), str);
        pNewHom->SetSourceWordSequence(&ws);
        pNewHom->PutArticle(article);
        wh.m_HomNum = pNewWord->AddRusHomonym(pNewHom);
    }
    // Record the found article under its keyword type and title.
    if (article.AuxDic().IsValid()) {
        const article_t* pArt = GlobalDictsHolder->GetAuxArticle(article.AuxDic());
        YASSERT(pArt != NULL);
        AddFoundArticle(pArt->get_kw_type(), pArt->get_title(), wh);
    } else {
        YASSERT(!article.Gzt().Empty());
        AddFoundArticle(article.Gzt().GetType(), article.Gzt().GetTitle(), wh);
    }
    m_wordSequences.push_back(&ws);
}
bool CAsmFile::ParseBlockCMT() { // 扫描注释. SetCurLine(0); CWord* pWord = NULL; bool bCmt = false; do { // 寻找 / * while ((pWord = GetAllNextWord()) != NULL) { string& str = pWord->GetWord(); if (str == "/*") { bCmt = true; pWord->SetAttrib(EWORD_COMMENT); break; } } // 标记注释和结束符 while ((pWord = GetAllNextWord()) != NULL) { string& str = pWord->GetWord(); pWord->SetAttrib(EWORD_COMMENT); if (str == "*/") { bCmt = false; break; } } } while (pWord != NULL); return (bCmt == false); }
// Returns the text of @word (joined via @allWords for multiwords), with the
// sentence-initial title-case removed unless the word carries at least two
// upper-case letters (e.g. an acronym).
inline Wtroka ExtractText(const CWord& word, const CWordVector& allWords) {
    Wtroka text;
    if (word.IsMultiWord())
        text = allWords.ToString(word);
    else
        text = word.GetText();
    const bool stripTitleCase = word.IsFirstInSentence() && !word.HasAtLeastTwoUpper();
    if (stripTitleCase)
        TMorph::ToLower(text);
    return text;
}
// Length of the longest common prefix of @u and @v.
// Fix: stop when either word is exhausted. The original dereferenced
// GetFront() on an empty word whenever one input was a prefix of the other
// (the analogous loop in LongestCommonSubwordCyclic guards against this).
CWord::size_type CommonPrefixLength(CWord u, CWord v) {
    CWord::size_type result = 0;
    while (!u.Empty() && !v.Empty() && u.GetFront() == v.GetFront()) {
        ++result;
        u.PopFront();
        v.PopFront();
    }
    return result;
}
// Normal form of @w under conjugation and inversion: the smaller of the
// least cyclic permutations of @w and of its inverse.
CWord ConjugationInverseNormalForm(const CWord& w) {
    // Precondition: the word is cyclically reduced (last letter does not
    // cancel the first).
    assert(w.Empty() || w.GetBack().Inverse() != w.GetFront());
    const auto candidate = LeastCyclicPermutation(w);
    const auto candidate_inv = LeastCyclicPermutation(w.Inverse());
    return (candidate < candidate_inv) ? candidate : candidate_inv;
}
bool CAnalyticFormBuilder::HasDeclinableSynNounInInstrumentalis(const CWord& _W) const { for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it) if (IsSynNoun(*it) && it->HasGrammem(gInstrumental) && !it->HasGrammem(gNominative)) return true; return false; }
bool CAnalyticFormBuilder::HasShortParticipleOrAdj(const CWord& _W) const { for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it) if (it->IsShortAdjectiveOrParticiple()) return true; return false; }
bool CWord::HasCommonHomonyms(const CWord& w) const { if (!IsDictionary() || !w.IsDictionary()) { SHomIt it1 = IterHomonyms(); SHomIt it2 = w.IterHomonyms(); if (it1.Ok() && it2.Ok() && TMorph::IdentifSimilar(it1->Lemma, it2->Lemma)) return true; } for (SHomIt it1 = IterHomonyms(); it1.Ok(); ++it1) for (SHomIt it2 = w.IterHomonyms(); it2.Ok(); ++it2) if (it1->Lemma == it2->Lemma) return true; return false; }
bool CAnalyticFormBuilder::HasCompar(const CWord& W) { for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it) if (it->IsFullAdjective() && it->HasGrammem(gComparative)) return true; return false; }
bool CAnalyticFormBuilder::HasInfinitive(const CWord& W) { for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it) if (it->HasGrammem(gInfinitive)) return true; return false; }
// Renders the sentence as HTML, wrapping every word range in @wp into a
// <font> tag with color @sColor; the result is encoded to @encoding and
// stripped of surrounding whitespace.
Stroka CSentence::ToHTMLColorString(const yvector<CWordsPair>& wp, const Stroka& sColor, ECharset encoding) const {
    Stroka out;
    for (size_t i = 0; i < m_words.size(); ++i) {
        CWord* pW = m_words[i].Get();
        // Open a colored span for every pair that starts at this word.
        for (size_t j = 0; j < wp.size(); ++j) {
            if (wp[j].FirstWord() == (int)i)
                out += Substitute("<font color=\"$0\">", sColor);
        }
        // Separate words with a space, except before most punctuation.
        const bool needSpace = i > 0 && (!pW->IsPunct() || pW->IsOpenBracket() || pW->IsCloseBracket());
        if (needSpace)
            out += " ";
        out += NStr::Encode(pW->GetOriginalText(), encoding);
        // Close the span for every pair that ends at this word.
        for (size_t j = 0; j < wp.size(); ++j) {
            if (wp[j].LastWord() == (int)i)
                out += "</font>";
        }
    }
    return StripString(out);
}
bool CAnalyticFormBuilder::HasPredik(const CWord& W) { for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it) if (!it->Grammems.IsIndeclinable()) // "бух","спасибо" не строятся с анал. формами if (it->HasGrammem(gPraedic)) return true; return false; }
bool CAsmFile::ParseKeyword() { SetCurLine(0); CWord* pWord = NULL; do { // bool bCmt = false; while ((pWord = GetLineNextWord()) != NULL) { if (pWord->GetAttrib() == EWORD_INIT) { pWord->SetAttrib(EWORD_KEYWORD); break; } } } while (GetNextLine()); return true; }
// Returns the bitset of tags the dictionary licenses for @word; a word the
// dictionary does not know may take any tag (all bits set).
// Fix: the result was held in a function-local `static`, which made the
// method needlessly non-reentrant and thread-unsafe; a plain local suffices.
unsigned long long TARGET_LANGUAGE::CTagger::getPossibleTagsForWord(const CWord& word) {
    unsigned long long possible_tags = m_TagDict.lookup(word);
    if (possible_tags == 0)
        possible_tags = ~0ULL;  // unknown word: every tag is possible
#ifdef _ENGLISH_TAGS_H
    possible_tags |= getPossibleTagsBySuffix(word.str());
    possible_tags |= PENN_TAG_MUST_SEE;
#endif
    assert(possible_tags != 0);
    return possible_tags;
}
// return found (or newly made) homonym id static int FindOrMakeMultiwordHomonym(const CWordSequence& ws, CWord& word, TKeyWordType kwtype, THomonymGrammems grammems, THomonymPtr& res) { // use the content of CWordSequence.m_Lemmas as new homonym lemma Wtroka str = ws.GetLemma(); if (!str) str = word.GetLowerText(); TMorph::ToLower(str); int homId = -1; // search if we already have a multiword homonym with such text if (!FindBestHomonym(word, str, kwtype, grammems, homId)) { // create new homonym, if there is no ready one res = new CHomonym(TMorph::GetMainLanguage(), str); if (ws.HasAuxArticle()) res->PutAuxArticle(ws.GetAuxArticleIndex()); else res->PutGztArticle(ws.GetGztArticle()); homId = word.AddRusHomonym(res); } return homId; }
bool CAsmFile::ParseLineCMT() {
    // Mark single-line comments: everything from a comment marker to the
    // end of its line gets the comment attribute. One full pass per marker,
    // exactly as the original two copy-pasted loops did.
    const char* const markers[] = { "@", "//" };
    for (size_t m = 0; m < sizeof(markers) / sizeof(markers[0]); ++m) {
        SetCurLine(0);
        do {
            bool inComment = false;
            CWord* pWord;
            while ((pWord = GetLineNextWord()) != NULL) {
                string& str = pWord->GetWord();
                if (str == markers[m])
                    inComment = true;
                if (inComment)
                    pWord->SetAttrib(EWORD_COMMENT);
            }
        } while (GetNextLine());
    }
    return true;
}
// Longest piece of @w over all cyclic rotations (delegates per-rotation
// work to LongestPieceFixedStart). Returns early once the piece reaches
// |w|/6, since callers only compare against that threshold.
CWord::size_type LongestPiece(const CWord& w) {
    CWord::size_type best = 1;
    auto rotation = w;
    do {
        best = std::max(best, LongestPieceFixedStart(rotation));
        if (best * 6 >= w.size())
            return best;
        rotation.CyclicLeftShift();
    } while (rotation != w);
    return best;
}
bool CAnalyticFormBuilder::AllHomonymsArePredicates(const CWord& W) const { const TGramBitSet VerbGerundPraedic(gVerb, gGerund, gPraedic); for (CWord::SHomIt it = W.IterHomonyms(); it.Ok(); ++it) { if (it->HasAnyOfPOS(VerbGerundPraedic) || it->IsShortAdjectiveOrParticiple()) continue; //verbs that can not be predicates or an.f. if (!TAnalyticFormPredicates::Has(it->Lemma)) return false; } return true; }
// Finds the longest common contiguous subword of @u and @v where both are
// treated as cyclic words (every rotation of each is considered).
// Returns (start index in u, start index in v, length). When the words
// share no letter, length is 0 and both start indices equal the respective
// word sizes (one-past-the-end sentinels).
// Brute force over all rotation pairs: O(|u| * |v| * min(|u|, |v|)).
std::tuple<unsigned short, unsigned short, unsigned short> LongestCommonSubwordCyclic(CWord u, CWord v) {
    unsigned short max_common_prefix = 0;
    unsigned short u_max_begin = u.size();
    unsigned short v_max_begin = v.size();
    for (unsigned short current_u_begin = 0; current_u_begin < u.size(); ++current_u_begin) {
        for (unsigned short current_v_begin = 0; current_v_begin < v.size(); ++current_v_begin) {
            // Longest common prefix of the current rotations, consuming
            // letters until a mismatch or either copy runs out.
            auto u_copy = u;
            auto v_copy = v;
            unsigned short current_common_prefix_length = 0;
            while (u_copy.GetFront() == v_copy.GetFront()) {
                ++current_common_prefix_length;
                u_copy.PopFront();
                if (u_copy.Empty()) {
                    break;
                }
                v_copy.PopFront();
                if (v_copy.Empty()) {
                    break;
                }
            }
            if (current_common_prefix_length > max_common_prefix) {
                u_max_begin = current_u_begin;
                v_max_begin = current_v_begin;
                max_common_prefix = current_common_prefix_length;
            }
            // Advance to the next rotation of v.
            v.CyclicLeftShift();
        }
        // v has completed a full cycle and is back to its original rotation;
        // advance to the next rotation of u.
        u.CyclicLeftShift();
    }
    return std::make_tuple(u_max_begin, v_max_begin, max_common_prefix);
}
bool CAnalyticFormBuilder::HasAnalyticalBe(const CWord& _W) const { //NB! в настоящем варианте парсера нет словаря оборотов, поэтому проверка слова на оборот временно отключена // если мы попали на оборот(например, "может быть"), тогда не будем строить здесь анал. форму. //if (_W.IsInOborot()) return false; // "быто" предсказывается как "быть" //if (_W.m_bPredicted) return false; //NB! пока предсказание отсутствует в парсере //if ( _W.HasPOS(UnknownPOS) ) return false; if (_W.HasUnknownPOS()) return false; for (CWord::SHomIt it = _W.IterHomonyms(); it.Ok(); ++it) { bool is_verb = it->IsPersonalVerb(); if (is_verb && !it->IsPresentTense() && it->Lemma == kByt) return true; if ((is_verb || it->HasGrammem(gInfinitive)) && it->Lemma == kStat) return true; } return false; }
// Adds (method == eAdd) or subtracts the local segmentation features of the
// word at @outout[index] to/from the feature weights. @round is the current
// training iteration — presumably for averaged updates; confirm against
// updateScore's implementation.
void CFeatureHandle::updateLocalFeatureVector(SCORE_UPDATE method, const CStringVector* outout, int index, int round) {
    // Current word and the preceding word (empty word at sentence start).
    CWord word = outout->at(index);
    CWord last_word = index > 0 ? outout->at(index - 1) : g_emptyWord;
    CTwoWords two_word;
    two_word.allocate(word.str(), last_word.str());
    // Individual characters of the current word.
    CStringVector chars;
    chars.clear();
    getCharactersFromUTF8String(word.str(), &chars);
    // Word lengths, clipped to LENGTH_MAX-1.
    int length = getUTF8StringLength(word.str());
    if (length > LENGTH_MAX - 1)
        length = LENGTH_MAX - 1;
    int last_length = getUTF8StringLength(last_word.str());
    if (last_length > LENGTH_MAX - 1)
        last_length = LENGTH_MAX - 1;
    // Boundary characters of the current and previous words.
    CWord first_char = getFirstCharFromUTF8String(word.str());
    CWord last_char = getLastCharFromUTF8String(word.str());
    CWord first_char_last_word = index > 0 ? getFirstCharFromUTF8String(last_word.str()) : g_emptyWord;
    CWord last_char_last_word = index > 0 ? getLastCharFromUTF8String(last_word.str()) : g_emptyWord;
    // The two characters straddling the word boundary.
    CWord two_char = index > 0 ? last_char_last_word.str() + first_char.str() : g_emptyWord;
    CTwoWords first_and_last_char, lastword_firstchar, currentword_lastchar, firstcharlastword_word, lastword_lastchar;
    first_and_last_char.allocate(first_char.str(), last_char.str());
    if (index > 0) {
        lastword_firstchar.allocate(last_word.str(), first_char.str());
        currentword_lastchar.allocate(word.str(), last_char_last_word.str());
        firstcharlastword_word.allocate(first_char_last_word.str(), first_char.str());
        lastword_lastchar.allocate(last_char_last_word.str(), last_char.str());
    }
    // +1 to reinforce, -1 to penalize.
    SCORE_TYPE amount = ((method == eAdd) ? 1 : -1);
    m_weights.m_mapSeenWords.updateScore(word, amount, round);
    m_weights.m_mapLastWordByWord.updateScore(two_word, amount, round);
    if (length == 1)
        m_weights.m_mapOneCharWord.updateScore(first_char, amount, round);
    else {
        m_weights.m_mapFirstAndLastChars.updateScore(first_and_last_char, amount, round);
        // Every adjacent character bigram inside the word.
        for (int j = 0; j < chars.size() - 1; j++) {
            m_weights.m_mapConsecutiveChars.updateScore(chars[j] + chars[j + 1], amount, round);
        }
        m_weights.m_mapLengthByFirstChar.updateScore(std::make_pair(first_char, length), amount, round);
        m_weights.m_mapLengthByLastChar.updateScore(std::make_pair(last_char, length), amount, round);
    }
    if (index > 0) {
        // Features spanning the boundary with the previous word.
        m_weights.m_mapSeparateChars.updateScore(two_char, amount, round);
        m_weights.m_mapLastWordFirstChar.updateScore(lastword_firstchar, amount, round);
        m_weights.m_mapCurrentWordLastChar.updateScore(currentword_lastchar, amount, round);
        m_weights.m_mapFirstCharLastWordByWord.updateScore(firstcharlastword_word, amount, round);
        m_weights.m_mapLastWordByLastChar.updateScore(lastword_lastchar, amount, round);
        m_weights.m_mapLengthByLastWord.updateScore(std::make_pair(last_word, length), amount, round);
        m_weights.m_mapLastLengthByWord.updateScore(std::make_pair(word, last_length), amount, round);
    }
}
// Searches @word for a homonym with the given @lemma, writing its id to
// @homId. Preference order:
//   1) same lemma + same kwtype + same grammems  (returns immediately),
//   2) same lemma + same kwtype,
//   3) first homonym with the same lemma.
// Returns true iff a matching homonym was found; @homId is untouched
// otherwise.
static bool FindBestHomonym(const CWord& word, TWtringBuf lemma, TKeyWordType kwtype, THomonymGrammems grammems, int& homId) {
    // Search if we already have a multiword homonym with such text.
    bool found = false;
    bool foundSameKwType = false;
    for (CWord::SHomIt it = word.IterHomonyms(); it.Ok(); ++it)
        if (it->CHomonymBase::GetLemma() == lemma) {
            if (it->HasKWType(kwtype, KW_DICT)) {
                if (it->Grammems == grammems) {
                    homId = it.GetID();  // prefer homonyms with same kwtype and same grammems
                    return true;
                } else if (!foundSameKwType) {
                    homId = it.GetID();  // prefer homonyms with same kwtype
                    found = true;
                    foundSameKwType = true;
                }
            } else if (!found) {
                homId = it.GetID();  // otherwise, return the first one having @lemma text
                found = true;
            }
        }
    return found;
}
// Adds (method == eAdd) or subtracts the local tagging features of the word
// at @sentence[index] to/from the feature weights. @round is the training
// iteration — presumably for averaged updates; confirm against
// updateCurrent's implementation.
void CTagger :: updateLocalFeatureVector( SCORE_UPDATE method , const CTwoStringVector * sentence , unsigned long index , unsigned long round ) {
    // Current, previous and next words (empty word at sentence boundaries).
    CWord word = sentence->at( index ).first ;
    CWord last_word = index > 0 ? sentence->at( index - 1 ).first : g_emptyWord ;
    CWord next_word = index < sentence->size() - 1 ? sentence->at( index + 1 ).first : g_emptyWord ;
    // Characters of the current and previous words.
    CStringVector chars , last_chars ;
    chars.clear() ;
    getCharactersFromUTF8String( sentence->at(index).first , &chars ) ;
    last_chars.clear() ;
    if ( index > 0 )
        getCharactersFromUTF8String( sentence->at( index - 1 ).first , &last_chars ) ;
    // Word lengths in characters (clipping disabled).
    int length = chars.size() ;
    //if ( length > LENGTH_MAX-1 ) length = LENGTH_MAX-1 ;
    int last_length = last_chars.size() ;
    //if ( last_length > LENGTH_MAX-1 ) last_length = LENGTH_MAX-1 ;
    // Boundary characters of the current/previous/next words.
    CWord first_char = chars[ 0 ];
    CWord last_char = chars[ chars.size() - 1 ];
    CWord first_char_last_word = index > 0 ? last_chars[ 0 ] : g_emptyWord;
    CWord last_char_last_word = index > 0 ? last_chars[ last_chars.size() - 1 ] : g_emptyWord;
    CWord first_char_next_word = index + 1 < sentence->size() ? getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord ;
    // Last two characters of the previous word; for a single-character
    // previous word, it borrows the last character two words back.
    CWord last_twochar_last_word = last_chars.size() > 1 ? last_chars[ last_chars.size() - 2 ] + last_chars[ last_chars.size() - 1] : ( index > 1 ? getLastCharFromUTF8String(sentence->at(index-2).first) + last_chars[ 0 ] : g_emptyWord );
    // First two characters of the current word; for a single-character word,
    // it borrows the first character of the next word.
    CWord first_twochar = chars.size() > 1 ? chars[ 0 ] + chars [ 1 ] : ( index + 1 <sentence->size() ? chars[ 0 ] + getFirstCharFromUTF8String( sentence->at( index + 1 ).first ) : g_emptyWord );
    CWord currentword_lasttwochar = index > 1 ? last_twochar_last_word.str() + word.str() : g_emptyWord ;
    CWord lastword_firsttwochar = index > 0 && index+1 < sentence->size() ? last_word.str() + first_twochar.str() : g_emptyWord ;
    // The two characters straddling the word boundary.
    CWord two_char = index > 0 ? last_char_last_word.str() + first_char.str() : g_emptyWord ;
    CWord lastword_firstchar = index > 0 ? last_word.str() + first_char.str() : g_emptyWord ;
    CWord currentword_lastchar = index > 0 ? last_char_last_word.str() + word.str() : g_emptyWord ;
    // For single-character words: previous-last + word + next-first chars.
    CWord three_char = length == 1 ? last_char_last_word.str() + word.str() + first_char_next_word.str() : g_emptyWord ;
    CTwoWords two_word ;
    // Tags of the current word and the two preceding ones.
    const CTag tag( sentence->at(index).second ) ;
    const CTag last_tag = index > 0 ? CTag( sentence->at( index-1 ).second) : CTag::SENTENCE_BEGIN ;
    const CTag second_last_tag = index > 1 ? CTag( sentence->at( index-2 ).second) : CTag::SENTENCE_BEGIN ;
    const CTagSet<CTag, 2> tag_bigram(encodeTags(tag, last_tag));
    const CTagSet<CTag, 3> tag_trigram(encodeTags(tag, last_tag, second_last_tag));
    CTaggedWord<CTag, TAG_SEPARATOR> wt1, wt2;
    CTwoTaggedWords wt12;
    // Char categories: tag-dictionary bits plus the current tag's own bit.
    long int first_char_cat = m_weights->m_mapCharTagDictionary.lookup(first_char) | (1<<tag.code()) ;
    long int last_char_cat = m_weights->m_mapCharTagDictionary.lookup(last_char) | (1<<tag.code()) ;
    // +1 to reinforce, -1 to penalize.
    SCORE_TYPE amount = method == eAdd ? 1 : -1 ;
    m_weights->m_mapCurrentTag[ std::make_pair(word, tag) ].updateCurrent( amount , round ) ;
    m_weights->m_mapLastTagByTag[ tag_bigram ].updateCurrent( amount , round ) ;
    m_weights->m_mapLastTwoTagsByTag[ tag_trigram ].updateCurrent( amount , round ) ;
    if ( index > 0 ) {
        // Word/neighbouring-tag features, restricted to short words.
        if ( last_length <= 2 )
            m_weights->m_mapTagByLastWord[ std::make_pair(last_word, tag) ].updateCurrent( amount , round ) ;
        if ( length <= 2 )
            m_weights->m_mapLastTagByWord[ std::make_pair(word, last_tag) ].updateCurrent( amount , round ) ;
        if ( length <= 2 )
            m_weights->m_mapTagByWordAndPrevChar[ std::make_pair(currentword_lastchar, tag) ].updateCurrent( amount , round ) ;
        if ( last_length <= 2 )
            m_weights->m_mapTagByWordAndNextChar[ std::make_pair(lastword_firstchar, last_tag) ].updateCurrent( amount , round ) ;
    }
    if ( length == 1 ) {
        // Single-character word, taken together with its char context.
        if ( index > 0 && index < sentence->size() - 1 )
            m_weights->m_mapTagOfOneCharWord[ std::make_pair(three_char, tag) ].updateCurrent( amount , round ) ;
    } else {
        m_weights->m_mapTagByFirstChar[ std::make_pair(first_char, tag) ].updateCurrent( amount , round ) ;
        m_weights->m_mapTagByLastChar[ std::make_pair(last_char, tag) ].updateCurrent( amount , round ) ;
        // m_weights->m_mapTagByFirstCharCat[ std::make_pair(first_char_cat, tag) ].updateCurrent( amount , round ) ;
        m_weights->m_mapTagByLastCharCat[ std::make_pair(last_char_cat, tag) ].updateCurrent( amount , round ) ;
        for ( int j = 0 ; j < chars.size() ; ++ j ) {
            // Interior characters under the current tag.
            if ( j > 0 && j < chars.size() - 1 )
                m_weights->m_mapTagByChar[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;
            if ( j > 0 ) {
                // Non-initial character paired with the first character.
                wt1.load(chars[j], tag);
                wt2.load(first_char);
                wt12.allocate(wt1, wt2);
                m_weights->m_mapTaggedCharByFirstChar[ wt12 ].updateCurrent( amount , round ) ;
                // Consecutive repeated characters under the current tag.
                if ( chars[j] == chars[j-1] )
                    m_weights->m_mapRepeatedCharByTag[ std::make_pair(CWord(chars[j]), tag) ].updateCurrent( amount , round ) ;
            }
            if (j<chars.size()-1) {
                // Non-final character paired with the last character.
                wt1.load(chars[j], tag);
                wt2.load(last_char);
                wt12.allocate(wt1, wt2);
                m_weights->m_mapTaggedCharByLastChar[ wt12 ].updateCurrent(amount, round);
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////// // CFString::ExtractWordsFromString void CFString::ExtractWordsFromString( CSTR_line* Comingline , PageElementCount* Count) { CSTR_rast_attr rast_attr; CSTR_rast rast; RecVersions vers; CSTR_line line; CSTR_attr line_attr; CWord* CurrentWord; const char* SeparatorsWord=" "; const char* result=NULL; int16_t FlagWord; Bool FlagString,FlagCapDrop; line = *Comingline; FlagWord = 0; FlagString = FALSE; FlagCapDrop = FALSE; CSTR_GetLineAttr( line , &line_attr ); if(line_attr.Flags & CSTR_STR_CapDrop) //буквица - см. первая буква в сказках FlagCapDrop = TRUE; SetRect(&m_rectBaseLine, line_attr.bs1, line_attr.bs2, line_attr.bs3, line_attr.bs4); //don't used now SetRect(&m_rectString, line_attr.col - TemplateOffset.x, line_attr.row - TemplateOffset.y, line_attr.col - TemplateOffset.x + line_attr.wid, line_attr.row - TemplateOffset.y + line_attr.hei); #ifdef alDebug //obsolete option { RECT rect; SetRect(&rect,line_attr.col, line_attr.row, line_attr.col + line_attr.wid, line_attr.row + line_attr.hei); pInputArray->push_back(rect); } #endif rast = CSTR_GetFirstRaster( line ); // I фикт. растр rast = CSTR_GetNextRaster( rast , CSTR_f_all ); while( rast ) { CSTR_GetCollection( rast,&vers ); CSTR_GetAttr( rast,&rast_attr ); //#define NEG_HALF_SPACE 0x1e //#define POS_HALF_SPACE 0x1f //#define REGULAR_SPACE 0x20 if((rast_attr.flg&CSTR_f_dust) || (rast_attr.flg&CSTR_f_fict ) ||//Дусты и ((rast_attr.flg&CSTR_f_space) && (!vers.Alt[0].Code || vers.Alt[0].Code == 0x1e || vers.Alt[0].Code == 0x1f)))//полупробелы выкидываем. 
goto next_raster; FlagString = TRUE; result = strchr( SeparatorsWord, vers.Alt[0].Code ); if( FlagWord ){ // word openly if(result) // word is broken FlagWord =0; else{ // word lasts assert( CurrentWord ); CurrentWord->AddLetter2Word( &rast , Count ,&FlagCapDrop ); } } else{ // expect new word if(!result){ // beginning word ++m_wWordsCount; ++Count->Words; FlagWord = 1; m_arWords.push_back( new CWord() ); CurrentWord = m_arWords[m_wWordsCount-1]; assert( CurrentWord ); if(rast_attr.font_spec == CSTR_fs_courier) CurrentWord->m_wFontNumber = rast_attr.font & 0xFC; else CurrentWord->m_wFontNumber = rast_attr.font; CurrentWord->m_wFontPointSize = rast_attr.keg; CurrentWord->AddLetter2Word( &rast , Count ,&FlagCapDrop ); if(FlagCapDrop){ CurrentWord->m_wFontPointSize = 14; FlagCapDrop = FALSE; FlagWord = 0; } } } next_raster: rast = CSTR_GetNextRaster( rast , CSTR_f_all ); } if(FlagString == FALSE) Count->Strings--; }
// Registers the word sequence @ws as a multiword: finds or creates the
// target word/homonym, merges grammems from the sequence, its article and
// @newPos, attaches the article, and (when @takeOnwership) stores @ws in
// m_wordSequences. Returns the coordinates of the resulting homonym.
SWordHomonymNum CMultiWordCreator::AddMultiWordInt(CWordSequence* ws, bool takeOnwership, const TGramBitSet& newPos, const CWordsPair& searchAreaWP) {
    SWordHomonymNum wh = ws->GetMainWord();
    Wtroka stmp;
    SWordHomonymNum newWH;
    CWord* pNewWord = GetWordForMultiWord(*ws, stmp, newWH);
    pNewWord->m_SourceWords.SetPair(ws->FirstWord(), ws->LastWord());
    // Pull title/type/output-grammems from the sequence's article (gazetteer
    // takes precedence over aux-dic).
    TGramBitSet art_grammems;  // output grammems of article
    Wtroka article_title;
    TKeyWordType article_type = NULL;
    if (ws->HasGztArticle()) {
        const TGztArticle& gzt_article = ws->GetGztArticle();
        article_title = gzt_article.GetTitle();
        article_type = gzt_article.GetType();
        const NGzt::TMessage* lemma = gzt_article.GetLemmaInfo();
        if (lemma != NULL)
            art_grammems = gzt_article.GetLemmaOutputGrammems(*lemma);
    } else if (ws->HasAuxArticle()) {
        const article_t* pArt = GlobalDictsHolder->GetAuxArticle(ws->GetAuxArticleIndex());
        art_grammems = pArt->get_new_pos();
        article_title = pArt->get_title();
        article_type = pArt->get_kw_type();
    }
    // Grammems for the new homonym: prefer the sequence's own grammems,
    // falling back to the main word's.
    THomonymGrammems newGram;
    if (!ws->GetGrammems().Empty()) {
        newGram = ws->GetGrammems();
        if (!newGram.HasForms() && wh.IsValid())
            newGram.SetPOS(m_Words[wh].Grammems.GetPOS());
    } else if (wh.IsValid() && HasToAddGrammemsFromMainWord(*ws))
        newGram = m_Words[wh].Grammems;
    MergeGrammems(newGram, art_grammems, newPos);
    THomonymPtr pNewHom;
    if (pNewWord->IsMultiWord() && (pNewWord->GetSourcePair().Size() != 1 || !wh.IsValid())) {
        newWH.m_HomNum = FindOrMakeMultiwordHomonym(*ws, *pNewWord, article_type, newGram, pNewHom);
        YASSERT(newWH.IsValid());
    }
    if (pNewHom.Get() == NULL) {
        if (!pNewWord->IsMultiWord()) {
            if (wh.IsValid())
                newWH = wh;
            else {
                // just take the first homonym
                newWH.m_bOriginalWord = true;
                newWH.m_WordNum = pNewWord->GetSourcePair().FirstWord();
                newWH.m_HomNum = pNewWord->IterHomonyms().GetID();
            }
        }
        YASSERT(newWH.IsValid());
        // It often happens that we would have to clone perfectly identical
        // homonyms differing only in the attached aux_dic articles; with
        // geo_thesaurus.cxx this would spawn a huge number of homonyms
        // (50+ for "Петров"). So if the articles differ neither in assigned
        // grammems, POS nor KWType, we do not clone the homonym and instead
        // record the extra articles in CHomonym::m_KWtype2Articles (done in
        // CWord::PutArticleIndex): when the found articles are considered
        // indistinguishable, the sequence keeps one main word carrying the
        // first of them. E.g. articles "_петрова_2" and "_петрова_3" are the
        // same for the parser (they differ only in GEO_PART, which the
        // parser ignores), so there is no need to multiply homonyms.
        bool bCloneAnyway = (!newGram.Empty() && !(m_Words[newWH].Grammems == newGram)) || !GlobalDictsHolder->BuiltinKWTypes().IsGeo(article_type);
        if (ws->HasAuxArticle())
            newWH.m_HomNum = m_Words.GetWord(newWH).PutAuxArticle(newWH.m_HomNum, ws->GetAuxArticleIndex(), bCloneAnyway);
        else
            newWH.m_HomNum = m_Words.GetWord(newWH).PutGztArticle(newWH.m_HomNum, ws->GetGztArticle(), bCloneAnyway);
    }
    YASSERT(newWH.IsValid());
    AddFoundArticle(article_type, article_title, newWH, searchAreaWP);
    CHomonym& h = m_Words[newWH];
    h.SetSourceWordSequence(ws);
    if (!newGram.Empty())
        h.SetGrammems(newGram);
    if (takeOnwership) {
        // Normalize the multiword unless the sequence already carries lemmas.
        if (!ws->HasLemmas())
            NormalizeMultiWordHomonym(pNewWord, &h);
        m_wordSequences.push_back(ws);
    }
    return newWH;
}