//Try to predict double-word surname if the last part is found in dictionary. //Then check if the first part is found in dictionary as surname. //If it is not then try predicting it. bool CWord::PredictHyphenSurname() { if (!((m_typ == Hyphen || m_typ == HypDiv || m_typ == DivWord) && m_variant.size() > 0)) return false; size_t ii = m_txt.find('-'); if (ii == Wtroka::npos) return false; //only one hyphen is allowed if (ii != m_txt.rfind('-')) return false; Wtroka strRightPart = m_txt.substr(ii + 1); if (strRightPart.size() < 1 || !::IsUpper(strRightPart[0])) return false; int iH = -1; THomonymGrammems rightPartGrammems; Wtroka strRightPartLemma; if (!RightPartIsSurname(iH, rightPartGrammems, strRightPartLemma)) return false; ii = m_txt.find('-'); //unnecessary call? if (ii == Wtroka::npos) return false; Wtroka strFirstPart = m_txt.substr(0, ii); TMorph::ToLower(strFirstPart); //look in morphology THomonymVector res; TMorph::GetDictHomonyms(strFirstPart, res); bool found = false; for (size_t i = 0; i < res.size(); ++i) { if (!found && res[i]->HasGrammem(gSurname) && NGleiche::Gleiche(res[i]->Grammems, rightPartGrammems, NGleiche::GenderNumberCaseCheck)) { found = true; Wtroka joined_lemma = res[i]->GetLemma() + '-' + strRightPartLemma; AddHyphenSurnameLemma(iH, rightPartGrammems, joined_lemma); } } if (found) return true; //if the word was in morphology then do not do any further predictions if (res.size() > 0) return false; yvector<TSurnamePredictor::TPredictedSurname> out; TMorph::PredictSurname(strFirstPart, out); if (out.size() > 0 && NGleiche::Gleiche(out[0].FlexGrammars, rightPartGrammems.Forms(), NGleiche::GenderNumberCaseCheck)) { Wtroka joined_lemma = out[0].Lemma + '-' + strRightPartLemma; AddHyphenSurnameLemma(iH, rightPartGrammems, joined_lemma); return true; } return false; }
bool CWord::RightPartIsSurname(int& iH, THomonymGrammems& grammems, Wtroka& strLemma) { iH = HasMorphNounWithGrammems_i(TGramBitSet(gSurname)); if (iH != -1) { CHomonym& h = GetRusHomonym(iH); grammems = h.Grammems; strLemma = h.GetLemma(); size_t ii = strLemma.find('-'); YASSERT(ii != Wtroka::npos); strLemma = strLemma.substr(ii + 1); return true; } //if this word is in morphology - do not try to predict if (IsDictionary()) return false; size_t ii = m_txt.find('-'); if (ii == Wtroka::npos) return false; Wtroka strRightPart = m_txt.substr(ii + 1); TMorph::ToLower(strRightPart); yvector<TSurnamePredictor::TPredictedSurname> out; if (!TMorph::PredictSurname(strRightPart, out)) return false; TGrammarBunch newForms; NSpike::ToGrammarBunch(out[0].StemGrammar, out[0].FlexGrammars, newForms); grammems.Reset(newForms); strLemma = out[0].Lemma; return true; }
bool THomonymInflector::FindInForms(const THomonymGrammems& forms, const TGramBitSet& grammems, TGramBitSet& resgram) { using NInfl::DefaultFeatures; using NInfl::TFeature; if (forms.HasForms()) { for (THomonymGrammems::TFormIter it = forms.IterForms(); it.Ok(); ++it) if (it->HasAll(grammems)) { resgram = *it; return true; } } else if (forms.IsIndeclinable() && DefaultFeatures().BitSet(TFeature::Case, TFeature::Number).HasAll(grammems)) { resgram = grammems; return true; } return false; }
void CSentence::AddFIOWS(const CFIOOccurence& fioOccurence, const SFullFIO& fio, int iSimilarOccurencesCount) { TIntrusivePtr<CFioWordSequence> fioWS(new CFioWordSequence(fio)); *(CFIOOccurence*)(fioWS.Get()) = fioOccurence; fioWS->PutWSType(FioWS); if (fio.m_Genders.any()) { THomonymGrammems gram = fioWS->GetGrammems(); gram.Replace(NSpike::AllGenders, fio.m_Genders); fioWS->SetGrammems(gram); } fioWS->m_iSimilarOccurencesCount = iSimilarOccurencesCount; bool isManualFio = true; SWordHomonymNum wh = fioOccurence.m_NameMembers[Surname]; if (wh.IsValid()) if (m_Words.GetWord(wh).IsMultiWord()) isManualFio = false; if (!fio.m_bFoundSurname && fioOccurence.m_NameMembers[Surname].IsValid() && !(fioOccurence.m_NameMembers[FirstName].IsValid() || fioOccurence.m_NameMembers[InitialName].IsValid()) && isManualFio) { CNameFinder nameFinder(m_Words); //если не смогли среди предсказанных фамилий //найти совпадающую с фамилией из fio, то вываливаемся if (!nameFinder.PredictSingleSurname(*fioWS, fio)) return; } fioWS->ClearLemmas(); if (!fio.m_strSurname.empty()) { Wtroka capLemma; if (fioOccurence.m_NameMembers[Surname].IsValid()) { const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[Surname]); capLemma = GetCapitalizedLemma(w, -1, fio.m_strSurname); } else { capLemma = fio.m_strSurname; NStr::ToFirstUpper(capLemma); } fioWS->AddLemma(SWordSequenceLemma(fio.m_strSurname, capLemma)); } if (!fio.m_strName.empty()) { Wtroka capLemma; if (fioOccurence.m_NameMembers[FirstName].IsValid()) { const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[FirstName]); capLemma = GetCapitalizedLemma(w, -1, fio.m_strName); } else if (fioOccurence.m_NameMembers[InitialName].IsValid()) { const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[InitialName]); capLemma = GetCapitalizedLemma(w, -1, fio.m_strName); } else { capLemma = fio.m_strName; TMorph::ToTitle(capLemma); } fioWS->AddLemma(SWordSequenceLemma(fio.m_strName, capLemma)); } if (!fio.m_strPatronomyc.empty()) { Wtroka 
capLemma; if (fioOccurence.m_NameMembers[Patronomyc].IsValid()) { const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[Patronomyc]); capLemma = GetCapitalizedLemma(w, -1, fio.m_strPatronomyc); } else if (fioOccurence.m_NameMembers[InitialPatronomyc].IsValid()) { const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[InitialPatronomyc]); capLemma = GetCapitalizedLemma(w, -1, fio.m_strPatronomyc); } else { capLemma = fio.m_strPatronomyc; TMorph::ToTitle(capLemma); } fioWS->AddLemma(SWordSequenceLemma(fio.m_strPatronomyc, capLemma)); } TakeFioWS(fioWS); }
// Registers the word sequence @ws as a (multi)word homonym and returns the
// word/homonym number it ended up at.
//
// @ws             word sequence to register; it is set as the source word
//                 sequence of the resulting homonym.
// @takeOnwership  if true, @ws is appended to m_wordSequences and, when it has
//                 no lemmas, the homonym is normalized first.
//                 NOTE(review): presumably m_wordSequences owns its elements - confirm.
// @newPos         part-of-speech grammems handed to MergeGrammems together
//                 with the article grammems.
// @searchAreaWP   handed to AddFoundArticle together with the article type/title.
SWordHomonymNum CMultiWordCreator::AddMultiWordInt(CWordSequence* ws, bool takeOnwership, const TGramBitSet& newPos, const CWordsPair& searchAreaWP)
{
    SWordHomonymNum wh = ws->GetMainWord();
    Wtroka stmp;
    SWordHomonymNum newWH;
    CWord* pNewWord = GetWordForMultiWord(*ws, stmp, newWH);
    pNewWord->m_SourceWords.SetPair(ws->FirstWord(), ws->LastWord());

    // collect grammems, title and type from the gazetteer article or,
    // failing that, from the aux-dictionary article attached to ws
    TGramBitSet art_grammems; // output grammems of article
    Wtroka article_title;
    TKeyWordType article_type = NULL;
    if (ws->HasGztArticle()) {
        const TGztArticle& gzt_article = ws->GetGztArticle();
        article_title = gzt_article.GetTitle();
        article_type = gzt_article.GetType();
        const NGzt::TMessage* lemma = gzt_article.GetLemmaInfo();
        if (lemma != NULL)
            art_grammems = gzt_article.GetLemmaOutputGrammems(*lemma);
    } else if (ws->HasAuxArticle()) {
        const article_t* pArt = GlobalDictsHolder->GetAuxArticle(ws->GetAuxArticleIndex());
        art_grammems = pArt->get_new_pos();
        article_title = pArt->get_title();
        article_type = pArt->get_kw_type();
    }

    // start from the grammems of ws itself (taking the POS from the main word
    // when ws has no forms), otherwise from the main word's grammems
    THomonymGrammems newGram;
    if (!ws->GetGrammems().Empty()) {
        newGram = ws->GetGrammems();
        if (!newGram.HasForms() && wh.IsValid())
            newGram.SetPOS(m_Words[wh].Grammems.GetPOS());
    } else if (wh.IsValid() && HasToAddGrammemsFromMainWord(*ws))
        newGram = m_Words[wh].Grammems;

    MergeGrammems(newGram, art_grammems, newPos);

    // a multiword spanning more than one source word, or one without a valid
    // main word, gets its homonym from FindOrMakeMultiwordHomonym
    THomonymPtr pNewHom;
    if (pNewWord->IsMultiWord() && (pNewWord->GetSourcePair().Size() != 1 || !wh.IsValid())) {
        newWH.m_HomNum = FindOrMakeMultiwordHomonym(*ws, *pNewWord, article_type, newGram, pNewHom);
        YASSERT(newWH.IsValid());
    }

    // pNewHom was not set above: attach the article to an existing homonym
    if (pNewHom.Get() == NULL) {
        if (!pNewWord->IsMultiWord()) {
            if (wh.IsValid())
                newWH = wh;
            else {
                // just take the first homonym
                newWH.m_bOriginalWord = true;
                newWH.m_WordNum = pNewWord->GetSourcePair().FirstWord();
                newWH.m_HomNum = pNewWord->IterHomonyms().GetID();
            }
        }
        YASSERT(newWH.IsValid());
        // It often happens that we are forced to clone absolutely identical
        // homonyms which differ only in the aux_dic articles assigned to them.
        // For geo_thesaurus.cxx this can produce a huge number of homonyms
        // (more than 50 for the surname "Petrov"). So, if the articles do not
        // differ in their content field ("СОСТАВ"), in the grammems they
        // assign, in the part of speech and in KWType, the homonyms are not
        // cloned and the extra articles are stored in
        // CHomonym::m_KWtype2Articles. This happens in CWord::PutArticleIndex.
        // If the articles found for one and the same homonym are considered
        // indistinguishable, then the main word of ws is the same for all of
        // them and it gets the first of the indistinguishable articles.
        // For example, the articles "_петрова_2" and "_петрова_3" are the same
        // for us (they differ only in the geo-part field "ГЕО_ЧАСТЬ", which is
        // irrelevant for the parser), so there is no point in multiplying homonyms.
        //
        // Clone when the merged grammems are non-empty and differ from the
        // homonym's grammems, or when the article type is not a geo type.
        bool bCloneAnyway = (!newGram.Empty() && !(m_Words[newWH].Grammems == newGram)) || !GlobalDictsHolder->BuiltinKWTypes().IsGeo(article_type);
        if (ws->HasAuxArticle())
            newWH.m_HomNum = m_Words.GetWord(newWH).PutAuxArticle(newWH.m_HomNum, ws->GetAuxArticleIndex(), bCloneAnyway);
        else
            newWH.m_HomNum = m_Words.GetWord(newWH).PutGztArticle(newWH.m_HomNum, ws->GetGztArticle(), bCloneAnyway);
    }

    YASSERT(newWH.IsValid());
    AddFoundArticle(article_type, article_title, newWH, searchAreaWP);
    CHomonym& h = m_Words[newWH];
    h.SetSourceWordSequence(ws);
    if (!newGram.Empty())
        h.SetGrammems(newGram);
    if (takeOnwership) {
        if (!ws->HasLemmas())
            NormalizeMultiWordHomonym(pNewWord, &h);
        m_wordSequences.push_back(ws);
    }
    return newWH;
}
// Merges article grammems and an explicitly requested part of speech into @dst.
//
// @dst           grammems to update in place.
// @art_grammems  grammems coming from the article (may include a POS).
// @newPos        explicitly requested POS; takes precedence over the POS of
//                @art_grammems.
static void MergeGrammems(THomonymGrammems& dst, const TGramBitSet& art_grammems, const TGramBitSet& newPos)
{
    // first, reset Part Of Speech if any
    if (newPos.HasAny(TMorph::AllPOS()))
        dst.SetPOS(newPos);
    else if (art_grammems.HasAny(TMorph::AllPOS()))
        dst.SetPOS(art_grammems);

    // take other grammems from @art_grammems if any
    TGramBitSet other = art_grammems & ~TMorph::AllPOS();
    if (other.any()) {
        // if there is a form with such grammems - just leave it alone and drop the rest ones.
        // The iterator walks the forms of @dst itself, so the matching form is
        // copied and the loop is left before @dst is modified; resetting @dst
        // while still iterating over it (and from one of its own elements)
        // is not safe in general.
        bool found = false;
        TGramBitSet matchedForm;
        for (THomonymGrammems::TFormIter it = dst.IterForms(); it.Ok(); ++it)
            if (it->HasAll(other)) {
                matchedForm = *it;
                found = true;
                break;
            }

        if (found) {
            dst.ResetSingleForm(matchedForm);
        } else {
            // otherwise merge all forms and replace grammems by classes
            TGramBitSet newForm = dst.All();
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllCases);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllGenders);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllNumbers);
            const TGramBitSet anim(gAnimated, gInanimated);
            newForm.ReplaceByMaskIfAny(art_grammems, anim);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllTimes);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllPersons);

            // just add the rest non-classified grammems
            static const TGramBitSet nonclassified = ~(NSpike::AllCases | NSpike::AllGenders | NSpike::AllNumbers | anim | NSpike::AllTimes | NSpike::AllPersons);
            newForm |= art_grammems & nonclassified;

            dst.Reset(newForm);
        }
    }

    // if we still do not know POS, apply some workarounds:
    // treat the word as a noun with every case, gender and number allowed
    if (dst.GetPOS().none()) {
        dst.SetPOS(TGramBitSet(gSubstantive));
        if (!dst.HasAny(NSpike::AllCases))
            dst.Add(NSpike::AllCases);
        if (!dst.HasAny(NSpike::AllGenders))
            dst.Add(NSpike::AllGenders);
        if (!dst.HasAny(NSpike::AllNumbers))
            dst.Add(NSpike::AllNumbers);
    }

    // set a noun or adj without additional grammem as indeclinable
    if (!dst.HasAny(~TMorph::AllPOS()) && (art_grammems.Has(gSubstantive) || TMorph::IsFullAdjective(art_grammems)))
        dst.Add(NSpike::AllCases | NSpike::AllGenders | NSpike::AllNumbers);
}