Example #1
0
//Try to predict double-word surname if the last part is found in dictionary.
//Then check if the first part is found in dictionary as surname.
//If it is not then try predicting it.
bool CWord::PredictHyphenSurname()
{
    if (!((m_typ == Hyphen || m_typ == HypDiv || m_typ == DivWord) && m_variant.size() > 0))
        return false;

    size_t ii = m_txt.find('-');
    if (ii == Wtroka::npos)
        return false;
    //only one hyphen is allowed
    if (ii != m_txt.rfind('-'))
        return false;
    Wtroka strRightPart = m_txt.substr(ii + 1);
    if (strRightPart.size() < 1 || !::IsUpper(strRightPart[0]))
        return false;

    int iH = -1;
    THomonymGrammems rightPartGrammems;
    Wtroka strRightPartLemma;
    if (!RightPartIsSurname(iH, rightPartGrammems, strRightPartLemma))
        return false;

    ii = m_txt.find('-');       //unnecessary call?
    if (ii == Wtroka::npos)
        return false;
    Wtroka strFirstPart = m_txt.substr(0, ii);
    TMorph::ToLower(strFirstPart);

    //look in morphology
    THomonymVector res;
    TMorph::GetDictHomonyms(strFirstPart, res);
    bool found = false;
    for (size_t i = 0; i < res.size(); ++i) {
        if (!found && res[i]->HasGrammem(gSurname) &&
            NGleiche::Gleiche(res[i]->Grammems, rightPartGrammems, NGleiche::GenderNumberCaseCheck)) {
            found = true;
            Wtroka joined_lemma = res[i]->GetLemma() + '-' + strRightPartLemma;
            AddHyphenSurnameLemma(iH, rightPartGrammems, joined_lemma);
        }
    }

    if (found)
        return true;

    //if the word was in morphology then do not do any further predictions
    if (res.size() > 0)
        return false;

    yvector<TSurnamePredictor::TPredictedSurname> out;
    TMorph::PredictSurname(strFirstPart, out);
    if (out.size() > 0 && NGleiche::Gleiche(out[0].FlexGrammars, rightPartGrammems.Forms(), NGleiche::GenderNumberCaseCheck)) {
        Wtroka joined_lemma = out[0].Lemma + '-' + strRightPartLemma;
        AddHyphenSurnameLemma(iH, rightPartGrammems, joined_lemma);
        return true;
    }
    return false;
}
Example #2
0
bool CWord::RightPartIsSurname(int& iH, THomonymGrammems& grammems, Wtroka& strLemma)
{
    iH = HasMorphNounWithGrammems_i(TGramBitSet(gSurname));
    if (iH != -1) {
        CHomonym& h = GetRusHomonym(iH);
        grammems = h.Grammems;
        strLemma = h.GetLemma();
        size_t ii = strLemma.find('-');
        YASSERT(ii != Wtroka::npos);
        strLemma = strLemma.substr(ii + 1);
        return true;
    }

    //if this word is in morphology - do not try to predict
    if (IsDictionary())
        return false;

    size_t ii = m_txt.find('-');
    if (ii == Wtroka::npos)
        return false;
    Wtroka strRightPart = m_txt.substr(ii + 1);
    TMorph::ToLower(strRightPart);

    yvector<TSurnamePredictor::TPredictedSurname> out;
    if (!TMorph::PredictSurname(strRightPart, out))
        return false;

    TGrammarBunch newForms;
    NSpike::ToGrammarBunch(out[0].StemGrammar, out[0].FlexGrammars, newForms);
    grammems.Reset(newForms);
    strLemma = out[0].Lemma;
    return true;
}
Example #3
0
bool THomonymInflector::FindInForms(const THomonymGrammems& forms, const TGramBitSet& grammems, TGramBitSet& resgram) {
    using NInfl::DefaultFeatures;
    using NInfl::TFeature;
    if (forms.HasForms()) {
        for (THomonymGrammems::TFormIter it = forms.IterForms(); it.Ok(); ++it)
            if (it->HasAll(grammems)) {
                resgram = *it;
                return true;
            }
    } else if (forms.IsIndeclinable() && DefaultFeatures().BitSet(TFeature::Case, TFeature::Number).HasAll(grammems)) {
        resgram = grammems;
        return true;
    }

    return false;
}
Example #4
0
void CSentence::AddFIOWS(const CFIOOccurence& fioOccurence, const SFullFIO& fio, int iSimilarOccurencesCount)
{
    TIntrusivePtr<CFioWordSequence> fioWS(new CFioWordSequence(fio));
    *(CFIOOccurence*)(fioWS.Get()) = fioOccurence;
    fioWS->PutWSType(FioWS);
    if (fio.m_Genders.any()) {
        THomonymGrammems gram = fioWS->GetGrammems();
        gram.Replace(NSpike::AllGenders, fio.m_Genders);
        fioWS->SetGrammems(gram);
    }

    fioWS->m_iSimilarOccurencesCount = iSimilarOccurencesCount;

    bool isManualFio = true;

    SWordHomonymNum wh = fioOccurence.m_NameMembers[Surname];
    if (wh.IsValid())
        if (m_Words.GetWord(wh).IsMultiWord())
            isManualFio = false;

    if (!fio.m_bFoundSurname &&
        fioOccurence.m_NameMembers[Surname].IsValid() &&
        !(fioOccurence.m_NameMembers[FirstName].IsValid() ||
            fioOccurence.m_NameMembers[InitialName].IsValid()) &&
            isManualFio) {
        CNameFinder nameFinder(m_Words);
        //если не смогли среди предсказанных фамилий
        //найти совпадающую с фамилией из fio, то вываливаемся
        if (!nameFinder.PredictSingleSurname(*fioWS, fio))
            return;
    }

    fioWS->ClearLemmas();
    if (!fio.m_strSurname.empty()) {
        Wtroka capLemma;
        if (fioOccurence.m_NameMembers[Surname].IsValid()) {
            const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[Surname]);
            capLemma = GetCapitalizedLemma(w, -1, fio.m_strSurname);
        } else {
            capLemma = fio.m_strSurname;
            NStr::ToFirstUpper(capLemma);
        }
        fioWS->AddLemma(SWordSequenceLemma(fio.m_strSurname, capLemma));
    }
    if (!fio.m_strName.empty()) {
        Wtroka capLemma;
        if (fioOccurence.m_NameMembers[FirstName].IsValid()) {
            const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[FirstName]);
            capLemma = GetCapitalizedLemma(w, -1, fio.m_strName);
        } else if (fioOccurence.m_NameMembers[InitialName].IsValid()) {
            const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[InitialName]);
            capLemma = GetCapitalizedLemma(w, -1, fio.m_strName);
        } else {
            capLemma = fio.m_strName;
            TMorph::ToTitle(capLemma);

        }
        fioWS->AddLemma(SWordSequenceLemma(fio.m_strName, capLemma));
    }
    if (!fio.m_strPatronomyc.empty()) {
        Wtroka capLemma;
        if (fioOccurence.m_NameMembers[Patronomyc].IsValid()) {
            const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[Patronomyc]);
            capLemma = GetCapitalizedLemma(w, -1, fio.m_strPatronomyc);
        } else if (fioOccurence.m_NameMembers[InitialPatronomyc].IsValid()) {
            const CWord& w = m_Words.GetWord(fioOccurence.m_NameMembers[InitialPatronomyc]);
            capLemma = GetCapitalizedLemma(w, -1, fio.m_strPatronomyc);
        } else {
            capLemma = fio.m_strPatronomyc;
            TMorph::ToTitle(capLemma);
        }

        fioWS->AddLemma(SWordSequenceLemma(fio.m_strPatronomyc, capLemma));
    }
    TakeFioWS(fioWS);
}
// Attaches a found word sequence (with its gzt or aux article) to a homonym of
// the corresponding word or multiword, and returns the address of that homonym.
//
// ws            - the word sequence; a pointer to it is stored in the homonym.
// takeOnwership - when true, ws is also appended to m_wordSequences and, if it
//                 has no lemmas yet, the homonym is normalized first.
//                 NOTE(review): presumably m_wordSequences owns its elements - confirm.
// newPos        - part-of-speech override passed to MergeGrammems.
// searchAreaWP  - search area forwarded to AddFoundArticle.
SWordHomonymNum CMultiWordCreator::AddMultiWordInt(CWordSequence* ws, bool takeOnwership,
                                                   const TGramBitSet& newPos, const CWordsPair& searchAreaWP)
{
    SWordHomonymNum wh = ws->GetMainWord();
    Wtroka stmp;
    SWordHomonymNum newWH;
    CWord* pNewWord = GetWordForMultiWord(*ws, stmp, newWH);

    pNewWord->m_SourceWords.SetPair(ws->FirstWord(), ws->LastWord());

    TGramBitSet art_grammems;      // output grammems of article
    Wtroka article_title;
    TKeyWordType article_type = NULL;

    // collect title, keyword type and output grammems from whichever article ws carries
    if (ws->HasGztArticle()) {
        const TGztArticle& gzt_article = ws->GetGztArticle();
        article_title = gzt_article.GetTitle();
        article_type = gzt_article.GetType();
        const NGzt::TMessage* lemma = gzt_article.GetLemmaInfo();
        if (lemma != NULL)
            art_grammems = gzt_article.GetLemmaOutputGrammems(*lemma);
    } else if (ws->HasAuxArticle()) {
        const article_t* pArt = GlobalDictsHolder->GetAuxArticle(ws->GetAuxArticleIndex());
        art_grammems = pArt->get_new_pos();
        article_title = pArt->get_title();
        article_type = pArt->get_kw_type();
    }

    // base grammems: those of ws itself if present (taking POS from the main word
    // when ws has no forms), otherwise optionally those of the main word
    THomonymGrammems newGram;
    if (!ws->GetGrammems().Empty()) {
        newGram = ws->GetGrammems();
        if (!newGram.HasForms() && wh.IsValid())
            newGram.SetPOS(m_Words[wh].Grammems.GetPOS());
    } else if (wh.IsValid() && HasToAddGrammemsFromMainWord(*ws))
        newGram = m_Words[wh].Grammems;
    MergeGrammems(newGram, art_grammems, newPos);

    // a multiword that spans more than one source word (or has no main word)
    // gets its own homonym
    THomonymPtr pNewHom;
    if (pNewWord->IsMultiWord() && (pNewWord->GetSourcePair().Size() != 1 || !wh.IsValid())) {
        newWH.m_HomNum = FindOrMakeMultiwordHomonym(*ws, *pNewWord, article_type, newGram, pNewHom);
        YASSERT(newWH.IsValid());
    }

    // no new homonym was made above: attach the article to an existing homonym
    if (pNewHom.Get() == NULL) {
        if (!pNewWord->IsMultiWord()) {
            if (wh.IsValid())
                newWH = wh;
            else {
                // just take the first homonym
                newWH.m_bOriginalWord = true;
                newWH.m_WordNum = pNewWord->GetSourcePair().FirstWord();
                newWH.m_HomNum = pNewWord->IterHomonyms().GetID();
            }
        }
        YASSERT(newWH.IsValid());
        // It often happens that we are forced to clone completely identical
        // homonyms that differ only in the aux_dic articles attached to them.
        // With geo_thesaurus.cxx this can produce a huge number of homonyms
        // (more than 50 for "Petrov"). So if the articles do not differ from each
        // other in the "SOSTAV" (composition) field, in the assigned grammems,
        // in the part of speech and in KWType, the homonyms are not cloned and the
        // additional articles are stored in CHomonym::m_KWtype2Articles instead.
        // This happens in CWord::PutArticleIndex.
        // If the articles found for one and the same homonym are considered
        // indistinguishable, then the main word of ws is the same for all of them
        // and it gets the first of the indistinguishable articles.
        // For example, articles "_petrova_2" and "_petrova_3" are the same for us
        // (they differ only in "GEO_CHAST" (geo part), which does not matter to the
        // parser), so there is no reason to multiply homonyms.
        //
        // Clone when the merged grammems differ from the homonym's current ones,
        // or when the article type is not a geo type.
        bool bCloneAnyway = (!newGram.Empty() && !(m_Words[newWH].Grammems == newGram)) ||
                            !GlobalDictsHolder->BuiltinKWTypes().IsGeo(article_type);

        if (ws->HasAuxArticle())
            newWH.m_HomNum = m_Words.GetWord(newWH).PutAuxArticle(newWH.m_HomNum, ws->GetAuxArticleIndex(), bCloneAnyway);
        else
            newWH.m_HomNum = m_Words.GetWord(newWH).PutGztArticle(newWH.m_HomNum, ws->GetGztArticle(), bCloneAnyway);
    }
    YASSERT(newWH.IsValid());

    // register the article and bind the resulting homonym to ws
    AddFoundArticle(article_type, article_title, newWH, searchAreaWP);
    CHomonym& h = m_Words[newWH];
    h.SetSourceWordSequence(ws);
    if (!newGram.Empty())
        h.SetGrammems(newGram);

    if (takeOnwership) {
        if (!ws->HasLemmas())
            NormalizeMultiWordHomonym(pNewWord, &h);
        m_wordSequences.push_back(ws);
    }

    return newWH;
}
// Merges the output grammems of a dictionary article into the grammems of a homonym.
//
// dst          - in/out: grammems to be adjusted.
// art_grammems - grammems prescribed by the article (may include a part of speech).
// newPos       - explicit part-of-speech override; takes priority over the article's POS.
//
// Steps:
//  1. the part of speech is replaced by newPos or, failing that, by the article's POS;
//  2. the non-POS article grammems either select the first existing form of dst
//     that already contains them all, or are merged into a single new form;
//  3. if no POS is known after that, dst becomes a noun, with all cases, genders
//     and numbers added for whichever of those classes is still absent;
//  4. a noun or full adjective article that leaves dst without any non-POS grammem
//     gets all cases, genders and numbers (i.e. it is treated as indeclinable).
static void MergeGrammems(THomonymGrammems& dst, const TGramBitSet& art_grammems, const TGramBitSet& newPos)
{
    // first, reset Part Of Speech if any
    if (newPos.HasAny(TMorph::AllPOS()))
        dst.SetPOS(newPos);
    else if (art_grammems.HasAny(TMorph::AllPOS()))
        dst.SetPOS(art_grammems);

    // take other grammems from @art_grammems if any
    TGramBitSet other = art_grammems & ~TMorph::AllPOS();
    if (other.any()) {
        // if there is a form with such grammems - just leave it alone and drop the rest ones.
        // The matching form is copied out and the loop is left before dst is modified:
        // resetting dst while its form iterator is still in use (and while *it still
        // refers to storage inside dst) is not safe.
        bool found = false;
        TGramBitSet matchedForm;
        for (THomonymGrammems::TFormIter it = dst.IterForms(); it.Ok(); ++it)
            if (it->HasAll(other)) {
                matchedForm = *it;
                found = true;
                break;
            }

        if (found) {
            dst.ResetSingleForm(matchedForm);
        } else {
            // otherwise merge all forms and replace grammems by classes
            TGramBitSet newForm = dst.All();
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllCases);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllGenders);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllNumbers);
            const TGramBitSet anim(gAnimated, gInanimated);
            newForm.ReplaceByMaskIfAny(art_grammems, anim);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllTimes);
            newForm.ReplaceByMaskIfAny(art_grammems, NSpike::AllPersons);
            // just add the rest non-classified grammems
            static const TGramBitSet nonclassified = ~(NSpike::AllCases | NSpike::AllGenders | NSpike::AllNumbers |
                                                       anim | NSpike::AllTimes | NSpike::AllPersons);
            newForm |= art_grammems & nonclassified;
            dst.Reset(newForm);
        }
    }

    // if we still do not know POS, apply some workarounds:
    if (dst.GetPOS().none()) {
        dst.SetPOS(TGramBitSet(gSubstantive));
        if (!dst.HasAny(NSpike::AllCases))
            dst.Add(NSpike::AllCases);
        if (!dst.HasAny(NSpike::AllGenders))
            dst.Add(NSpike::AllGenders);
        if (!dst.HasAny(NSpike::AllNumbers))
            dst.Add(NSpike::AllNumbers);
    }

    // set a noun or adj without additional grammem as indeclinable
    if (!dst.HasAny(~TMorph::AllPOS()) &&
        (art_grammems.Has(gSubstantive) || TMorph::IsFullAdjective(art_grammems)))
        dst.Add(NSpike::AllCases | NSpike::AllGenders | NSpike::AllNumbers);
}