bool CLemmatizer::CreateParadigmCollection(bool bNorm, string& InputWordStr, bool capital, vector<CFormInfo>& Result) const
{
	FilterSrc(InputWordStr);

	vector<CAutomAnnotationInner> FindResults;
	bool bFound = LemmatizeWord(InputWordStr, capital, m_bUsePrediction, FindResults, true);

	if (FindResults.empty())
	{
		// the word cannot be predicted or m_bUsePrediction is false
		return false;
	}

	AssignWeightIfNeed(FindResults);

	for (size_t i = 0; i < FindResults.size(); i++)
	{
		const CAutomAnnotationInner& A = FindResults[i];

		// if bNorm, ignore forms which are not lemmas
		if (bNorm && (A.m_ItemNo != 0))
			continue;

		CFormInfo P;
		P.Create(this, A, InputWordStr, bFound);
		Result.push_back(P);
	}

	return true;
}
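// Hypothetical usage sketch (not part of the library source): it assumes a concrete
// lemmatizer instance has already been constructed and its dictionaries loaded, and
// it only relies on the CreateParadigmCollection signature above and on
// CFormInfo::GetWordForm, which appears in LoadFrequentRoots below.
void PrintLemmas(const CLemmatizer& Lemmatizer, string WordStr)
{
	vector<CFormInfo> Paradigms;
	// bNorm = false: keep all homonyms; capital = false: the word was not capitalized
	if (!Lemmatizer.CreateParadigmCollection(false, WordStr, false, Paradigms))
		return;

	for (size_t i = 0; i < Paradigms.size(); i++)
		// assumption: form no. 0 of a paradigm is the lemma, mirroring the
		// m_ItemNo == 0 check inside CreateParadigmCollection
		printf("%s\n", Paradigms[i].GetWordForm(0).c_str());
}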
// Load a simple list of words from a file, lemmatize each word,
// and store the results as prediction roots.
void CMorphDictionary::LoadFrequentRoots(string path)
{
	string load_path = path + "predictroots.txt";
	FILE* fp = fopen(load_path.c_str(), "r");
	if (!fp) return;

	char buffer[1000];
	while (fgets(buffer, 1000, fp))
	{
		string WordStr = buffer;
		Trim(WordStr);
		if (WordStr.empty()) continue;
		RmlMakeUpper(WordStr, m_Language);

		vector<CFindWordNode> FindResults;
		bool retval = FindWord(WordStr, true, false, false, FindResults);
		if (!retval) continue;

		set<size_t> UsedFlexia;
		for (size_t i = 0; i < FindResults.size(); i++)
		{
			CFormInfo P;
			P.Create(this, FindResults[i].m_nBase, FindResults[i].m_LemmaInfo, FindResults[i].m_nFlex);

			// take one word form per flexion number
			for (size_t j = 0; j < P.GetCount(); j++)
			{
				size_t FlexNo = P.GetFlexNoOfForm(j);
				if (UsedFlexia.find(FlexNo) == UsedFlexia.end())
				{
					CPredictEndRoot R;
					R.m_BaseNo = FindResults[i].m_nBase;
					R.m_LemmaInfo = FindResults[i].m_LemmaInfo;
					// the word form is stored reversed (ending first) before sorting
					R.m_EndRoot = P.GetWordForm(j);
					reverse(R.m_EndRoot.begin(), R.m_EndRoot.end());
					R.m_FlexNo = FlexNo;
					UsedFlexia.insert(FlexNo);
					m_PredictEndRoots.push_back(R);
				}
			}
		}
	}

	fclose(fp);
	sort(m_PredictEndRoots.begin(), m_PredictEndRoots.end());
}
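// Standalone illustration (an assumption about the design, not library code) of why
// the word forms above are reversed before the list is sorted: after reversal, words
// that share an ending become neighbours sharing a prefix, so a binary search over
// the sorted reversed forms lands next to the entries with the longest common ending.
// The words and names below are invented for the example.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>
using namespace std;

int main()
{
	const char* Words[] = { "WALKED", "TALKED", "WALKS", "TALKS" };
	vector<string> Reversed;
	for (size_t i = 0; i < 4; i++)
	{
		string s = Words[i];
		reverse(s.begin(), s.end());
		Reversed.push_back(s);
	}
	sort(Reversed.begin(), Reversed.end()); // DEKLAT, DEKLAW, SKLAT, SKLAW

	string Unknown = "GRULKED";              // unknown word
	reverse(Unknown.begin(), Unknown.end()); // DEKLURG

	// the entry just before the insertion point is DEKLAW (= reversed WALKED),
	// which shares the prefix DEKL, i.e. the common ending LKED
	vector<string>::iterator it = lower_bound(Reversed.begin(), Reversed.end(), Unknown);
	for (vector<string>::iterator j = Reversed.begin(); j != Reversed.end(); ++j)
		printf("%s%s\n", j->c_str(), (j == it) ? "   <- insertion point is before this entry" : "");
	return 0;
}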