//-------------------------------------------------------------------------------- void init_dicts() { if (!MorphHolderRus.LoadGraphanAndLemmatizer(morphRussian)) throw CExpc("cannot load Russian morphologyn"); if (!MorphHolderEng.LoadGraphanAndLemmatizer(morphEnglish)) throw CExpc("cannot load English morphology\n"); if (!BinaryDictionary.Load()) throw CExpc("cannot load binary dictionary\n"); std::cerr << "dictionaries are loaded" << std::endl; }
void get_english_ancode(std::string e, std::string &ec) { vector<CFormInfo> ParadigmCollection; if (!MorphHolderEng.m_pLemmatizer->CreateParadigmCollection(true, e, true, false, ParadigmCollection)) throw CExpc(Format("Cannot lemmatize %s by English lemmatizer" , e.c_str())); // assert(count > 0); for(int i = 0; i < ParadigmCollection.size(); i++) { const CFormInfo& Paradigm = ParadigmCollection[i]; if(!Paradigm.m_bFound) continue; std::string ancode = Paradigm.GetAncode(0); if (ec=="na" && ( ancode=="nc" || ancode=="ne" || ancode=="ni")) { ec = ancode; return; }; if (ec=="nb" && ( ancode=="nd" || ancode=="ng" || ancode=="nk")) { ec = ancode; return; }; } }
// Fills the two translation tables between character codes (0..255) and
// alphabet positions for the given language and returns the alphabet size.
//
// A character code belongs to the alphabet when it is an upper-case letter of
// Language, the hyphen, AnnotChar, one of "'1234567890" (English only), or
// any morphURL letter (morphURL only). The German extra set is empty.
//
//   pCode2Alphabet[position] receives the character code at that position;
//   pAlphabet2Code[code]     receives the position of the code, or -1 when the
//                            code is not part of the alphabet.
//
// When the alphabet would exceed MaxAlphabetSize the error is reported through
// ErrorMessage and a CExpc is thrown.
// NOTE(review): assumes pAlphabet2Code holds at least 256 ints and
// pCode2Alphabet at least MaxAlphabetSize ints -- confirm at the call sites.
static int InitAlphabet(MorphLanguageEnum Language, int* pCode2Alphabet, int *pAlphabet2Code, size_t AnnotChar)
{
	assert (!is_upper_alpha(AnnotChar, Language));
	const string AdditionalEnglishChars = "'1234567890";
	const string AdditionalGermanChars = "";
	int AlphabetSize = 0;

	for (size_t i=0; i < 256; i++)
	{
		const bool bInAlphabet =
				is_upper_alpha((BYTE)i, Language)
			||	(i == '-')
			||	(i == AnnotChar)
			||	(		(Language == morphEnglish)
					&&	(AdditionalEnglishChars.find(i) != string::npos)
				)
			||	(		(Language == morphGerman)
					&&	(AdditionalGermanChars.find(i) != string::npos)
				)
			||	(		(Language == morphURL)
					&&	is_alpha(i, morphURL)
				);

		if (!bInAlphabet)
		{
			pAlphabet2Code[i] = -1;
			continue;
		}

		// Check the limit BEFORE writing: previously the test ran only after the
		// loop, i.e. after entries beyond MaxAlphabetSize had already been stored
		// in pCode2Alphabet. The failure condition itself is unchanged (an
		// alphabet of more than MaxAlphabetSize characters).
		if (AlphabetSize >= MaxAlphabetSize)
		{
			string Error = "Error! The  ABC is too large";
			ErrorMessage (Error);
			throw CExpc(Error);
		}

		pCode2Alphabet[AlphabetSize] = i;
		pAlphabet2Code[i] = AlphabetSize;
		AlphabetSize++;
	}

	return AlphabetSize;
}
// Serialization entry point of the document.
// Storing does nothing. Loading asks the user to log in, then loads the
// wizard from the archive's file path under the entered user name and switches
// the input language to the wizard's language.
// Throws CExpc when the login dialog is not confirmed with IDOK.
void CMorphwizardDoc::Serialize(CArchive& ar)
{
	// Nothing is written through the archive.
	if (ar.IsStoring())
		return;

	// ask username
	CDlgLogin dlgLogin;
	const bool bLoginConfirmed = (dlgLogin.DoModal() == IDOK);
	if (!bLoginConfirmed)
		throw CExpc("No login information");

	CGriIni cIni;
	cIni.Init();

	// Must be constructed before load_wizard() is called and stays alive until
	// the end of the function.
	CWizardProgressMeter meter(m_Wizard);

	// NOTE(review): the result of load_wizard() is stored but never examined.
	bool bLoaded = GetWizard()->load_wizard(ar.GetFile()->GetFilePath(), dlgLogin.m_name);

	SetInputLanguage(GetWizard()->m_Language);

	cIni.Exit();
}
int CSentence::CanLinkSimpleSimilar(int CommaWordNo) { try { /* if comma is at the very beginning of ath the end, then exit */ if ( (CommaWordNo == 0) || (CommaWordNo + 1 >= m_Words.size() ) ) return -1; if (GetOpt()->m_Language == morphGerman) { // we can use CSentence::m_GroupsUnion if Tomita is enabled for (size_t i=0; i< m_GroupsUnion.GetGroups().size(); i++) { const CGroup& group = m_GroupsUnion.GetGroups()[i]; // ignore groups which contain only three words and the second word is // a comma (a clause delimiter) if (group.size() == 3) if (m_Words[group.m_iFirstWord+1].m_bComma) continue; if ( (group.m_iFirstWord < CommaWordNo) && (group.m_iLastWord > CommaWordNo) ) return group.m_iLastWord; }; return -1; }; const int Radius = (GetOpt()->m_Language == morphGerman)? 10 : 6; int StartClauseWordNo = max(0, CommaWordNo - Radius); CSentence* pSent = GetOpt()->NewSentence(); if (!pSent) throw CExpc ("Cannot create sentence"); for (int i = StartClauseWordNo; i < min((int)m_Words.size(), CommaWordNo + Radius); i++) pSent->m_Words.push_back(m_Words[i]); CClause C(pSent, 0, pSent->m_Words.size() - 1); pSent->AddClause(C); pSent->m_bShouldUseTwoPotentialRule = false; pSent->RunSyntaxInClauses(SimpleSimilarRules); int Result = -1; const CClause& prClause = pSent->m_Clauses[0]; for (CSVI it = prClause.m_SynVariants.begin(); (Result == -1)&& (it!=prClause.m_SynVariants.end()); it++) for (size_t i=0; i< it->m_vectorGroups.GetGroups().size(); i++) { const CGroup& group = it->m_vectorGroups.GetGroups()[i]; // ignore groups which contain only three words and the second word is // a comma (a clause delimiter) if (group.size() == 3) { const CSynUnit& U = it->m_SynUnits[group.m_iFirstWord+1]; if (pSent->m_Words[U.m_SentPeriod.m_iFirstWord].m_bComma) continue; }; if ( (group.m_iFirstWord+StartClauseWordNo < CommaWordNo) && (group.m_iLastWord+StartClauseWordNo > CommaWordNo) ) { Result = group.m_iLastWord + StartClauseWordNo; break; }; } delete pSent; return Result; } catch(...) 
{ OutputErrorString("Failed RunSyntaxInClause(CanLinkSimpleSimilar)"); return -1; } }
void CTrigramModel::InitModelFromConfigAndBuildTagset(string FileName, const CLemmatizer* Lemmatizer, const CAgramtab* GramTab, bool bLoadReverseModel) { FILE * fp = fopen (FileName.c_str(), "r"); if (!fp) throw CExpc ("cannot read file %s\n", FileName.c_str()); string TagsetFile; string ReverseModelConfig; char buffer[1000]; while (fgets(buffer, 1000, fp)) { if (strchr (buffer, '#')) *strchr (buffer, '#') = 0; StringTokenizer tok(buffer, "\r\n\t "); if (!tok()) continue; string Field = tok.val(); string Value; if (tok()) Value = tok.val(); if (Field == "NgramFile") m_NgramFile = BuildRMLPath (Value.c_str()); else if (Field == "DictionaryFile") m_DictionaryFile = BuildRMLPath (Value.c_str()); else if (Field == "TagsetFile") TagsetFile = BuildRMLPath (Value.c_str()); else if (Field == "Language") { if (!GetLanguageByString(Value, m_Language)) throw CExpc ("unknown language:%s\n", Value.c_str()); } else if (Field == "--second-local-coef") { m_SecondLocalCoef = atoi(Value.c_str()); if (!m_SecondLocalCoef) throw CExpc ("wrong second local coef: %s\n", Value.c_str()); fprintf(stderr, "second local coef: %i\n", m_SecondLocalCoef); m_bUseSecondLocalMax = true; } else if (Field == "--min-bucket-size") { m_MinBucketSize = atoi(Value.c_str()); if (!m_MinBucketSize) throw CExpc ("wrong min bucket size: %s\n", Value.c_str()); fprintf(stderr, "min bucket size: %i\n", m_MinBucketSize); } else if (Field == "reverse-model-config") { ReverseModelConfig = Value; } else if (Field == "--raw-texts") { m_bRawTexts = true; } else if (Field == "--supress-morph-errors") { m_bQuiet = true; } else if (Field == "--reverse-model") { m_bReverseModel = true; fprintf (stderr, "reverse model!\n"); } else if (Field == "--check-only-amb-words") { m_bCheckOnlyAmbiguosWords = true; fprintf (stderr, "evaluate precision and recall only for ambiguous words\n"); } }; fclose(fp); if ( m_NgramFile.empty() || m_DictionaryFile.empty() ) throw CExpc ("cannot find NgramFile or DictionaryFile in %s\n", 
FileName.c_str()); #ifdef USE_TRIGRAM_LEMMATIZER if (GramTab==0) { if (!InitDicts()) throw CExpc ("cannot initialize morphology\n", FileName.c_str()); } else { m_pLemmatizer = Lemmatizer; m_pAgramtab = GramTab; // m_Graphan is disabled }; if (!TagsetFile.empty()) { fprintf (stderr, "loading tagset from %s\n", TagsetFile.c_str()); if (!m_TagSet.ReadTagSet(TagsetFile, m_pAgramtab)) throw CExpc ("cannot load tagset"); } else { fprintf (stderr, "building default tagset\n"); m_TagSet.BuildDefaultTags(m_pAgramtab); } fprintf (stderr, "tag set file contains %i tags \n", m_TagSet.m_Tags.size()); #endif if (bLoadReverseModel && !ReverseModelConfig.empty()) { m_pReverseModel = new CTrigramModel(); fprintf(stderr, "load reverse model from : %s\n", ReverseModelConfig.c_str()); #ifdef USE_TRIGRAM_LEMMATIZER m_pReverseModel->InitModelFromConfigAndBuildTagset(ReverseModelConfig, m_pLemmatizer, m_pAgramtab, false); #else m_pReverseModel->InitModelFromConfigAndBuildTagset(ReverseModelConfig, 0, 0, false); #endif m_pReverseModel->ReadBinary(); } };
// Looks WordStr up in the model dictionary and collects the tags it may carry.
//
// Result.m_pFoundWord is the dictionary entry for the word as given or, failing
// that, for its lower-cased form (0 when neither is known).
// Result.m_PossibleWordTags is filled from
//   1. the tags stored for the dictionary entry,
//   2. the annotations registered for the word in the current sentence or,
//      when there are none and USE_TRIGRAM_LEMMATIZER is defined, the lemmatizer.
// If that leaves the set empty, a fallback is used: the number tags for a
// positive number in Russian, the "UNK" tag for punctuation or a word of
// another language, otherwise up to 200 of the tags in m_TagsOrderedByUnigrams
// (single-character punctuation tags excluded).
//
// Throws CExpc when the number tag or the "UNK" tag is required but missing.
CDictionarySearch CTrigramModel::find_word(const string& WordStr) const
{
	CDictionarySearch Result;
	assert (!WordStr.empty());

	// Reached only when the assert above is compiled out: an empty word gets
	// no dictionary entry and every tag number below m_TagsCount.
	if (WordStr.empty())
	{
		//fprintf (stderr, "Empty word!\n");
		Result.m_pFoundWord = 0;
		for (WORD TagNo = 0; TagNo < m_TagsCount; TagNo++)
			Result.m_PossibleWordTags.insert(TagNo);
		return Result;
	}

	Result.m_pFoundWord = lookup_word(WordStr);
	if (!Result.m_pFoundWord)
	{
		// the word is not in the dictionary as given: retry in lower case
		string LowerCased = WordStr;
		RmlMakeLower(LowerCased, m_Language);
		Result.m_pFoundWord = lookup_word(LowerCased);
	}

	if (Result.m_pFoundWord)
	{
		// assign every tag the word occurred with in the corpus
		for (size_t Offset = 0; Offset < Result.m_pFoundWord->m_Length; Offset++)
		{
			int Tag = m_LexProbs[Result.m_pFoundWord->m_StartOffset + Offset].m_Tag;
			Result.m_PossibleWordTags.insert(Tag);
		}
	}

	// get all possible tags from the morphological dictionary
	map<string, const vector<CXmlMorphAnnot>* >::iterator AnnotIt = m_CurrentSentenceWords2Annots.find(WordStr);
	if (AnnotIt != m_CurrentSentenceWords2Annots.end())
	{
		get_tags_from_annots(*AnnotIt->second, Result.m_PossibleWordTags, WordStr);
	}
#ifdef  USE_TRIGRAM_LEMMATIZER
	else
	{
		get_tags_from_lemmatizer_but_not_preps(WordStr, Result.m_PossibleWordTags);
	}
#endif

	// Some tag is already known: no fallback is needed.
	if (!Result.m_PossibleWordTags.empty())
		return Result;

	const bool bPositiveNumber = atoi(WordStr.c_str()) > 0;
	if (bPositiveNumber && (m_Language==morphRussian))
	{
		// every registered tag that starts with the number prefix
		for (size_t TagNo = 0; TagNo < m_RegisteredTags.size(); TagNo++)
		{
			if (m_RegisteredTags[TagNo].length() > 3 && m_RegisteredTags[TagNo].substr(0,4) == "„»—Ћ")
				Result.m_PossibleWordTags.insert(TagNo);
		}

		if (Result.m_PossibleWordTags.empty())
			throw CExpc ("Cannot find „»—Ћ tag");
	}
	else if (ispunct((BYTE)WordStr[0]) || !CheckLanguage(WordStr,m_Language))
	{
		int UnkTag = find_tag("UNK");
		if (UnkTag == UnknownTag)
			throw CExpc ("Cannot find UNK tag");
		Result.m_PossibleWordTags.insert(UnkTag);
	}
	else
	{
		// nothing is known about the word: assign all tags
		if (!m_bQuiet)
			fprintf (stderr, "No information for word %s\n",WordStr.c_str());

		const size_t Limit = min((size_t)200, m_TagsOrderedByUnigrams.size());
		for (size_t Rank = 0; Rank < Limit; Rank++)
		{
			WORD tagno = m_TagsOrderedByUnigrams[Rank];
			string TagName = m_RegisteredTags[tagno];
			// skip tags that consist of a single punctuation character
			if (TagName.length() > 1 || !ispunct((unsigned char)TagName[0]))
				Result.m_PossibleWordTags.insert(tagno);
		}
	}

	return Result;
}
void CMorphAutomatBuilder::ConvertBuildRelationsToRelations() { if (!m_pRoot) return; m_pRoot->SetNodeIdNullRecursive(); queue<CTrieNodeBuild*> NodesQueue; NodesQueue.push(m_pRoot); m_pRoot->m_NodeId = 0; vector<CMorphAutomNode> Nodes; vector<CMorphAutomRelation> Relations; while (!NodesQueue.empty()) { // getting an element from the queue CTrieNodeBuild* pNode = NodesQueue.front(); NodesQueue.pop(); CMorphAutomNode N; N.SetFinal(pNode->m_bFinal); N.SetChildrenStart(Relations.size()); assert (N.GetChildrenStart() == Relations.size()); assert (N.IsFinal() == pNode->m_bFinal); Nodes.push_back(N); int CurrentNodeId = Nodes.size() + NodesQueue.size(); for (size_t i=0; i < MaxAlphabetSize; i++) if (pNode->m_Children[i]) { CTrieNodeBuild* Child = pNode->m_Children[i]; if (Child->m_NodeId == -1) { Child->m_NodeId = CurrentNodeId++; NodesQueue.push(Child); }; // adding new relation CMorphAutomRelation R; R.SetRelationalChar(m_Code2Alphabet[i]); R.SetChildNo(Child->m_NodeId); assert (R.GetChildNo() == Child->m_NodeId); assert (R.GetRelationalChar() == m_Code2Alphabet[i]); Relations.push_back(R); if (Relations.size() > 0xffffff) { throw CExpc("Too many children in the automat. It cannot be more than 0xffffff"); }; }; }; Clear(); m_NodesCount = Nodes.size(); m_pNodes = new CMorphAutomNode[m_NodesCount]; copy(Nodes.begin(), Nodes.end(), m_pNodes); m_RelationsCount = Relations.size(); m_pRelations = new CMorphAutomRelation[m_RelationsCount]; copy(Relations.begin(), Relations.end(), m_pRelations); };
// порождает по числительному генитивный вариант. // например, два=>двух // это генитивный вариант используетс¤ в слвоах типа "двухламповый" bool BuildGenitFormOfCardinal(const CLemmatizer* piRusLemmatizer, const CRusGramTab* Agramtab) { GenitFormsOfCardinal.clear(); for(int i = 0 ; i < NumeralToNumberCount; i++ ) { if (NumeralToNumber[i].m_Number == 0) { GenitFormsOfCardinal.push_back("Ќ”Ћ№"); continue; }; if (NumeralToNumber[i].m_Number == 1) { GenitFormsOfCardinal.push_back("ќƒЌќ"); continue; }; if (NumeralToNumber[i].m_Number == 100) { GenitFormsOfCardinal.push_back("—“ќ"); continue; }; if (NumeralToNumber[i].m_Number == 1000) { GenitFormsOfCardinal.push_back("“џ—я„≈"); continue; }; if (NumeralToNumber[i].m_Number == 1000000) { GenitFormsOfCardinal.push_back("ћ»ЋЋ»ќЌќ"); continue; }; if (NumeralToNumber[i].m_Number == 1000000000) { GenitFormsOfCardinal.push_back("ћ»ЋЋ»ј–ƒЌќ"); continue; }; if (NumeralToNumber[i].m_Number == 1000000000000.0) { GenitFormsOfCardinal.push_back("“–»ЋЋ»ќЌќ"); continue; }; if (NumeralToNumber[i].m_Number == 1000000000000000.0) { GenitFormsOfCardinal.push_back(" ¬јƒ–»ЋЋ»ќЌќ"); continue; }; vector<CFormInfo> ParadigmCollection; string WordForm = NumeralToNumber[i].m_Cardinal; piRusLemmatizer->CreateParadigmCollection(true, WordForm, false, false, ParadigmCollection); // ищем числительное long k=0; for (; k < ParadigmCollection.size(); k++) { string AnCode = ParadigmCollection[k].GetAncode(0); BYTE POS = Agramtab->GetPartOfSpeech(AnCode.c_str()); if (NumeralToNumber[i].m_bNoun) { if (POS == NOUN) break; } else if (POS == NUMERAL) break; }; assert (k < ParadigmCollection.size()); const CFormInfo& P = ParadigmCollection[k]; // ищем родительный падеж for (k=0; k < P.GetCount(); k++) { string AnCode = P.GetAncode(k); QWORD Grammems; if (!Agramtab->GetGrammems(AnCode.c_str(), Grammems)) throw CExpc ("Bad ancode in BuildGenitFormOfCardinal"); if ( (Grammems & _QM(rGenitiv)) > 0 ) break; }; assert (k < P.GetCount()); 
GenitFormsOfCardinal.push_back(P.GetWordForm(k)); }; return true; };
// Returns the element at position Index of the synset that curr_synset
// currently points to.
// Throws CExpc when curr_synset is at the end of `synonims` (no current synset)
// or when Index is not a valid position in that synset.
// NOTE(review): the range check assumes the mapped value offers size(), as the
// "synonyms vector" wording of the message suggests -- confirm in the header.
DWORD CSynDictionary::GetId(UINT Index) const
{
	if (curr_synset == synonims.end())
		throw CExpc ("bad index in synonyms vector");

	// The message always spoke of a bad index, but the index itself used to be
	// passed to operator[] unchecked.
	if (Index >= curr_synset->second.size())
		throw CExpc ("bad index in synonyms vector");

	return curr_synset->second[Index];
}