/**
 * Prepares the lemmatizer and the gramtab stored in `dic`.
 *
 * Only `morphRussian` is accepted; any other value of `dic.Language`
 * is reported through throwEx() with the message "assertion error: A1".
 *
 * @param env  JNI environment handed to throwEx() when a step fails.
 * @param dic  Dictionary descriptor. On success `dic.lang`,
 *             `dic.pLemmatizer` and `dic.pAgramtab` are filled in.
 * @return true when both objects were created and loaded; false after
 *         throwEx() has been called for the first failing step.
 *
 * NOTE(review): the buffers produced by strdup()/str_compose() are handed
 * to throwEx() and never released here - presumably throwEx() owns them;
 * confirm. Objects already stored in `dic` are not released on failure.
 */
bool InitMorphologySystem(JNIEnv *env, jni_dictionary &dic)
{
    // Guard: every language except Russian is rejected up front.
    if (dic.Language != morphRussian)
    {
        throwEx(env, strdup("assertion error: A1"));
        return false;
    }
    dic.lang = Russian;

    // Human-readable language name, used only in the error messages below.
    string languageName = GetStringByLanguage(dic.Language);

    // Step 1: the morphological dictionary.
    dic.pLemmatizer = new T;
    string loadError;
    const bool lemmatizerLoaded = dic.pLemmatizer->LoadDictionariesRegistry(loadError);
    if (!lemmatizerLoaded)
    {
        throwEx(env, str_compose("Cannot load %s morphological dictionary. Error details: %s",
                                 languageName.c_str(), loadError.c_str()));
        return false;
    }

    // Step 2: the gramtab.
    dic.pAgramtab = new Y;
    const bool gramtabLoaded = dic.pAgramtab->LoadFromRegistry();
    if (!gramtabLoaded)
    {
        throwEx(env, str_compose("Cannot load %s gramtab.", languageName.c_str()));
        return false;
    }

    return true;
}
bool CMorphDictBuilder::GenPredictIdx(const MorphoWizard& wizard, int PostfixLength, int MinFreq, string path) { DwordVector ModelFreq(wizard.m_FlexiaModels.size(), 0); // building frequences of flexia models for(const_lemma_iterator_t lnMapIt = wizard.m_LemmaToParadigm.begin(); lnMapIt != wizard.m_LemmaToParadigm.end(); lnMapIt++) ModelFreq[lnMapIt->second.m_FlexiaModelNo]++; bool bSparsedDictionary; { int Count=0; for (size_t ModelNo=0; ModelNo<ModelFreq.size(); ModelNo++) if (ModelFreq[ModelNo] >= MinimalFlexiaModelFrequence) Count++; bSparsedDictionary = 2*Count < ModelFreq.size(); if (bSparsedDictionary) fprintf (stderr, "Flexia models are too sparsed\n"); }; string PlugLemma = GetPlugLemmabyLanguage(wizard.m_Language); int PlugLemmaInfoNo = -1; Flex2WordMap svMapRaw; // going through all words for(size_t lin =0; lin < m_LemmaInfos.size(); lin++) { if (!(lin%1000)) log ( Format("Pick up data...%i \r", lin) ) ; const CLemmaInfo& LemmaInfo = m_LemmaInfos[lin].m_LemmaInfo; size_t ModelNo = LemmaInfo.m_FlexiaModelNo; const CFlexiaModel& paradigm = m_FlexiaModels[ModelNo]; string base = m_Bases[m_LemmaInfos[lin].m_LemmaStrNo].GetString(); if (base+paradigm.get_first_flex() == PlugLemma) { PlugLemmaInfoNo = lin; continue; }; if (!bSparsedDictionary) if (ModelFreq[ModelNo] < MinimalFlexiaModelFrequence) continue; string pos = wizard.get_pos_string(paradigm.get_first_code()); WORD nps = GetPredictionPartOfSpeech(pos, wizard.m_Language); if (nps == UnknownPartOfSpeech) continue; const vector <bool>& Infos = m_ModelInfo[ModelNo]; for (size_t i=0; i<paradigm.m_Flexia.size(); i++) if (Infos[i]) { string flexia = paradigm.m_Flexia[i].m_FlexiaStr; string wordform = base + flexia; if (wordform.length() < PostfixLength) continue; string Postfix = wordform.substr(wordform.length() - PostfixLength); AddElem(svMapRaw, Postfix, lin, nps, i, ModelFreq, m_LemmaInfos); } } if (PlugLemmaInfoNo == -1) { ErrorMessage (Format("Cannot find a word for the default noun prediction (\"%s\") 
while generating %s prediction base",PlugLemma.c_str(), GetStringByLanguage(wizard.m_Language).c_str())); return false; }; log("Saving...\n"); CMorphAutomatBuilder R(wizard.m_Language); R.InitTrie(); // adding crtitical noun { string s = CriticalNounLetterPack; s += AnnotChar; s += R.EncodeIntToAlphabet(0); // noun s += AnnotChar; s += R.EncodeIntToAlphabet(PlugLemmaInfoNo); s += AnnotChar; s += R.EncodeIntToAlphabet(0); R.AddStringDaciuk(s); }; for( Flex2WordMap::const_iterator it=svMapRaw.begin(); it!=svMapRaw.end(); it++ ) { for( int i=0; i<it->second.size(); i++ ) { const CPredictWord& W = it->second[i]; // checking minimal frequence if (W.m_Freq < MinFreq) continue; string s = it->first; reverse(s.begin(), s.end()); s += AnnotChar; s += R.EncodeIntToAlphabet(W.m_nps); s += AnnotChar; s += R.EncodeIntToAlphabet(W.m_LemmaInfoNo); s += AnnotChar; s += R.EncodeIntToAlphabet(W.m_ItemNo); R.AddStringDaciuk(s); } }; R.ConvertBuildRelationsToRelations(); R.Save(path + PREDICT_BIN_PATH); svMapRaw.clear(); return true; }
/**
 * Loads the automat from the binary file `AutomatFileName`.
 *
 * File layout as read by this function:
 *   1. a text line holding the number of nodes;
 *   2. that many raw CMorphAutomNode records;
 *   3. a text line holding the number of relations;
 *   4. that many raw CMorphAutomRelation records;
 *   5. 256 ints that must be byte-equal to m_Alphabet2Code.
 *
 * The object is cleared with Clear() first. On success the file is closed
 * and BuildChildrenCache() is called.
 *
 * @param AutomatFileName  Path of the file to read.
 * @return true on success. false when the file cannot be opened, is
 *         truncated, declares a non-positive node count or a negative
 *         relation count, or carries a different alphabet table.
 *         ErrorMessage() is called only for the open failure and for the
 *         alphabet mismatch.
 *
 * The file is closed on every exit path. Members assigned before a failure
 * are left as they are; no Clear() is called after a failed load.
 */
bool CMorphAutomat::Load(string AutomatFileName)
{
    Clear();

    FILE* fp = fopen(AutomatFileName.c_str(), "rb");
    if (!fp)
    {
        ErrorMessage (Format("Cannot open %s", AutomatFileName.c_str()));
        return false;
    }

    // Closes the file when the reading scope below is left, whether by an
    // early "return false" or by reaching its end.
    struct CFileCloser
    {
        FILE* m_fp;
        explicit CFileCloser(FILE* f) : m_fp(f) {}
        ~CFileCloser() { fclose(m_fp); }
    };

    {
        CFileCloser closer(fp);
        char buffer[256];

        // Nodes: the count comes as a text line; zero or negative is rejected.
        if (!fgets(buffer, 256, fp))
            return false;
        const int NodesCount = atoi(buffer);
        if (NodesCount <= 0)
            return false;
        m_NodesCount = NodesCount;

        assert (m_pNodes == 0);
        m_pNodes = new CMorphAutomNode[m_NodesCount];
        if (fread(m_pNodes, sizeof(CMorphAutomNode), static_cast<size_t>(NodesCount), fp)
                != static_cast<size_t>(NodesCount))
            return false;

        // Relations: a zero count is accepted (as before), a negative one is not.
        if (!fgets(buffer, 256, fp))
            return false;
        const int RelationsCount = atoi(buffer);
        if (RelationsCount < 0)
            return false;
        m_RelationsCount = RelationsCount;

        assert (m_pRelations == 0);
        m_pRelations = new CMorphAutomRelation[m_RelationsCount];
        if (fread(m_pRelations, sizeof(CMorphAutomRelation), static_cast<size_t>(RelationsCount), fp)
                != static_cast<size_t>(RelationsCount))
            return false;

        // Alphabet table: a short read is a truncated file, not an alphabet
        // change, so it fails without the "alphabet has changed" message.
        int Alphabet2Code[256];
        if (fread(Alphabet2Code, sizeof(int), 256, fp) != 256)
            return false;
        if (memcmp(Alphabet2Code, m_Alphabet2Code, 256*sizeof(int)))
        {
            string err = Format("%s alphabet has changed; cannot load morph automat", GetStringByLanguage(m_Language).c_str());
            ErrorMessage(err);
            return false;
        }
    }

    // The file is already closed here, as in the original ordering.
    BuildChildrenCache();
    return true;
}