bool CMorphDictionary::FindWord(string& InputWordStr, bool cap, bool predict, bool max_predict, vector<CFindWordNode>& results) const { size_t nLen = InputWordStr.length(); if ( (nLen == 0) || (nLen > GLOBAL_MAX_WORD_LEN)) return false; RmlMakeUpper (InputWordStr, m_Language); CResultVector FlexResults; bool retval = FlexFind(InputWordStr, FlexResults); if( !retval ) return( false ); retval = FindBases(InputWordStr, FlexResults, results); if( retval ) return( true ); if( !predict ) return( false ); if( InputWordStr.length() < m_Predict.m_CountOfLetters ) return( false ); retval = PredictByDataBase(InputWordStr, FlexResults, results,cap,max_predict); return (retval); }
// CLemmatizer::LemmatizeWord should return true if // the word was found in the dictionary, if it was predicted, then it returns false bool CLemmatizer::LemmatizeWord(string& InputWordStr, const bool cap, const bool predict, vector<CAutomAnnotationInner>& results, bool bGetLemmaInfos) const { RmlMakeUpper (InputWordStr, GetLanguage()); size_t WordOffset = 0; m_pFormAutomat->GetInnerMorphInfos(InputWordStr, 0, results); bool bResult = !results.empty(); if (results.empty()) { if (m_bUsePrediction) { PredictBySuffix(InputWordStr, WordOffset, 4, results); // the length of the minal suffix is 4 if (InputWordStr[WordOffset-1] != '-') // and there is no hyphen { size_t KnownPostfixLen = InputWordStr.length()-WordOffset; size_t UnknownPrefixLen = WordOffset; if (KnownPostfixLen < 6)// if the known part is too short //if (UnknownPrefixLen > 5)// no prediction if unknown prefix is more than 5 { if (!IsPrefix(InputWordStr.substr(0, UnknownPrefixLen))) results.clear(); }; }; // отменяем предсказание по местоимениям, например "Семыкиным" for (size_t i=0; i<results.size(); i++) if (m_NPSs[results[i].m_ModelNo] == UnknownPartOfSpeech) { results.clear(); break; }; }; }; if (!results.empty()) { if (bGetLemmaInfos) GetLemmaInfos(InputWordStr, WordOffset, results); } else if (m_bUsePrediction) { PredictByDataBase(InputWordStr, results,cap); }; return bResult; }
// loading simple list of words from a file, lemmatizing it, and storing void CMorphDictionary::LoadFrequentRoots(string path) { string load_path = path+"predictroots.txt"; FILE*fp = fopen (load_path.c_str(), "r"); if (!fp) return; char buffer[1000]; while (fgets (buffer, 1000, fp)) { string WordStr = buffer; Trim(WordStr); if (WordStr.empty()) continue; RmlMakeUpper (WordStr, m_Language); vector<CFindWordNode> FindResults; bool retval = FindWord(WordStr, true, false, false, FindResults); if (!retval) continue; set<size_t> UsedFlexia; for (size_t i=0; i<FindResults.size(); i++) { CFormInfo P; P.Create( this, FindResults[i].m_nBase, FindResults[i].m_LemmaInfo, FindResults[i].m_nFlex ); for (size_t j=0; j < P.GetCount(); j++) { size_t FlexNo = P.GetFlexNoOfForm(j); if (UsedFlexia.find(FlexNo) == UsedFlexia.end()) { CPredictEndRoot R; R.m_BaseNo = FindResults[i].m_nBase; R.m_LemmaInfo = FindResults[i].m_LemmaInfo; R.m_EndRoot = P.GetWordForm(j); reverse(R.m_EndRoot.begin(),R.m_EndRoot.end()); R.m_FlexNo = FlexNo; UsedFlexia.insert(FlexNo); m_PredictEndRoots.push_back(R); }; }; }; }; fclose(fp); sort(m_PredictEndRoots.begin(), m_PredictEndRoots.end()); };
bool CGerSyntaxOpt :: InitOptionsLanguageSpecific() { // reading adjektives string strFileName = GetSyntaxFilePath()+"adj_prp.txt"; { if (!ReadListFile (strFileName.c_str(),(*m_pAdjPrp))) return false; // deleting valency information for (size_t i=0; i < m_pAdjPrp->size(); i++) { string& s = (*m_pAdjPrp)[i]; int q = s.find("+"); if (q != string::npos) s.erase(q); q = s.find(" "); if (q != string::npos) s.erase(q); RmlMakeUpper(s, morphGerman); }; sort(m_pAdjPrp->begin(), m_pAdjPrp->end()); }; // reading formats strFileName = GetSyntaxFilePath()+"gformats.txt"; m_FormatsGrammar.m_Language = morphGerman; m_FormatsGrammar.m_pGramTab = GetGramTab(); m_FormatsGrammar.m_SourceGrammarFile = strFileName; if (!LoadGrammarForGLR(m_FormatsGrammar, true, false)) { ErrorMessage( Format("Cannot load %s\n", strFileName.c_str())); return false; }; return true; }
// Rebuilds m_SynthLemmaToReplace from a string of the form
//     "lemma1/lemma2;lemma1/lemma2;..."
// The whole string is upper-cased (Russian) first; each ';'-separated pair is
// split at the first '/', and both halves are trimmed. Anything after a second
// '/' inside a pair is ignored.
// Returns false as soon as a pair has no '/' or one of its halves is empty;
// pairs accepted before the failure stay in the map. Returns true otherwise.
bool CRusSemStructure::SetLemmasToReplace(string LemmasToReplace)
{
	m_SynthLemmaToReplace.clear();
	RmlMakeUpper(LemmasToReplace, morphRussian);
	StringTokenizer tok(LemmasToReplace.c_str(), ";");
	while (tok())
	{
		const string OnePair = tok.val();

		// The pair is split with std::string operations instead of sscanf into
		// fixed-size buffers, so lemmas of any length cannot overflow the stack.
		const string::size_type FirstSlash = OnePair.find('/');
		if (FirstSlash == string::npos)
			return false;

		string lem1 = OnePair.substr(0, FirstSlash);
		Trim(lem1);
		if (lem1.empty())
			return false;

		// the second lemma ends at the next '/' or at the end of the pair
		const string::size_type SecondStart = FirstSlash + 1;
		const string::size_type SecondSlash = OnePair.find('/', SecondStart);
		string lem2 = (SecondSlash == string::npos)
			? OnePair.substr(SecondStart)
			: OnePair.substr(SecondStart, SecondSlash - SecondStart);
		Trim(lem2);
		if (lem2.empty())
			return false;

		m_SynthLemmaToReplace[lem1] = lem2;
	}
	return true;
}
bool CGrammarItem::AddAttribute(string Name, string Value, MorphLanguageEnum Language, string& ErrorStr) { if (Value.length() > 0) if (Value[0] == '"') { if ( (Value.length()<2) || (Value[Value.length() - 1] != '"')) { ErrorStr = Format("no matching quotation mark for attribute value \"%s\"",Value.c_str()); return false; }; Value = Value.substr(1, Value.length()-2); }; if (Name == "root") { m_bSynMain = true; return true; }; if (Name == "type") { m_TokenType = StringToTokenType(Value); if (m_TokenType == OTHER_TOKEN_TYPE) { ErrorStr = Format("unknown token type:%s ",Value.c_str()); return false; } }; if (Name == "hom") { if (Value == "yes") m_bCanHaveManyHomonyms = true; else if (Value == "no") m_bCanHaveManyHomonyms = false; else { ErrorStr = Format("Bad value for attribute \"hom\" (\"%s\"). It can be \"yes\" or \"no\"",Value.c_str()); return false; }; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; return true; }; if (Name == "grm") { m_MorphPattern.m_GrmAttribute = Value; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; return true; }; if (Name == "form") { m_Token = Value; RmlMakeUpper(m_Token, Language); m_ItemStrId = Value; if ( (m_TokenType == OTHER_TOKEN_TYPE) && !m_Token.empty()) { if (ispunct((BYTE)m_Token[0])) m_TokenType = PUNCTUAT; else if (isdigit((BYTE)m_Token[0])) m_TokenType = NUM; else if (Language == morphRussian) { if (CheckLanguage(m_Token, Language)) m_TokenType = RLE; } else { if (CheckLanguage(m_Token, Language)) m_TokenType = LLE; } }; return true; }; if (Name == "register") { if (Value == "AA") m_Register = UpUp; else if (Value == "aa") m_Register = LowLow; else if (Value == "Aa") m_Register = UpLow; else { ErrorStr = Format("Bad value for attribute \"register\" (\"%s\"). It can be \"AA\", \"aa\" or \"Aa\"",Value.c_str()); return false; }; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? 
RLE : LLE; return true; }; if (Name == "filename") { Value = GetPathByFile(CurrentSourceFileName) + Value; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; } m_Attributes[Name] = Value; return true; };
// Stores NewValue as the word (m_strWord) and keeps a copy of it in
// m_strUpperWord, upper-cased for the language `langua`.
void CLemWord::SetWordStr (string NewValue, MorphLanguageEnum langua)
{
	m_strUpperWord = m_strWord = NewValue;
	RmlMakeUpper(m_strUpperWord, langua);
}
//---------------------------------------------------------------------------- // Ищет слова по заданному в файле перечню. // В файле на каждой строке сначала стоит часть речи, а потом лемма. //---------------------------------------------------------------------------- void CMorphwizardView::OnToolsSelectByFile() { // TODO: Add your command handler code here CFileDialog D(TRUE, "slf", "paradigms.txt"); if (D.DoModal() != IDOK) return; FILE * fp = fopen (D.GetPathName(),"rb"); if (!fp) { AfxMessageBox ("Cannot open file"); return; }; char buf[1000]; std::string strNotFound; int ParadigmCount = 0; found_paradigms.clear(); while (fgets(buf, 1000, fp)) { std::string Line = buf; Trim(Line); if (Line.empty()) continue; StringTokenizer tok (Line.c_str(), ";"); if (!tok()) { std::string mess = std::string("cannot get lemma from ") + buf + std::string("; The format should be <Lemma>;<TypeGrammems>;<MorphPattern>"); AfxMessageBox (mess.c_str()); break; }; std::string Lemma = tok.val(); Trim(Lemma); if (!tok()) { std::string mess = std::string("cannot get type grammem ") + buf + std::string("; The format should be <Lemma> <PartofSpeech> <Grammems>"); AfxMessageBox (mess.c_str()); break; }; std::string TypeAncode; { std::string grams = tok.val(); Trim(grams); if (grams != "*") if (!GetWizard()->slf2ancode(grams, TypeAncode )) { std::string mess = std::string("cannot process type grammems ") + grams; AfxMessageBox (mess.c_str()); break; }; }; if (!tok()) { std::string mess = std::string("cannot get morphological pattern ") + buf + std::string("; The format should be <Lemma>;<TypeGrammems>;<MorphPattern>"); AfxMessageBox (mess.c_str()); break; }; std::string FirstCode; { std::string PosStr = tok.val(); Trim(PosStr); if (!GetWizard()->slf2ancode(PosStr,FirstCode )) { std::string mess = std::string("cannot process morph. 
pattern ") + PosStr; AfxMessageBox (mess.c_str()); break; }; }; std::vector<lemma_iterator_t> curr_found_paradigms; RmlMakeUpper(Lemma, GetWizard()->m_Language); bool bFound = false; GetWizard()->find_lemm(Lemma.c_str(), true, curr_found_paradigms); for(size_t i=0; i<curr_found_paradigms.size(); i++ ) { std::string str_pos = GetWizard()->m_FlexiaModels[curr_found_paradigms[i]->second.m_FlexiaModelNo].get_first_code();; if (curr_found_paradigms[i]->second.GetCommonAncodeIfCan() == TypeAncode) if( FirstCode == str_pos ) if (std::find(found_paradigms.begin(), found_paradigms.end(), curr_found_paradigms[i]) == found_paradigms.end()) { found_paradigms.push_back(curr_found_paradigms[i]); bFound = true; }; } if (!bFound) strNotFound += Format("Not found: %s\r\n", Line.c_str()); }; fclose(fp); FilterFoundParadigms(); ShowFoundParadigms(); if (!strNotFound.empty()) echo(strNotFound.c_str()); }
void CMorphwizardView::OnFind() { try { m_FoundList.DeleteAllItems(); found_paradigms.clear(); m_FindWhat.SetFocus(); CString find_what; m_FindWhat.GetWindowText(find_what); if (find_what == "") return; ChangeHistory((const char*)find_what); CWizardProgressMeter meter(*GetWizard()); if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_RFIND_LEM) { std::string s = find_what; RmlMakeUpper(s, GetWizard()->m_Language); Trim(s); char ch = s[0]; if( '0'<=ch && ch<='9' ) { int prdno = atoi(s.c_str()); GetWizard()->find_lemm_by_prdno(prdno,found_paradigms); } else { if (s.substr(0,7) == "ACCENT=") { int accent_model_no = atoi(s.c_str()+7); GetWizard()->find_lemm_by_accent_model(accent_model_no, found_paradigms); } else GetWizard()->find_lemm(s.c_str(), false, found_paradigms); }; } // if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_RFIND_GRM) GetWizard()->find_lemm_by_grammem((const char *)find_what, found_paradigms); if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_WORD_FORM) { std::string s = find_what; // нельзя применять RmlMakeUpper из-за "*" if (GetWizard()->IsGerman()) GerMakeUpper(s); else EngRusMakeUpper(s); GetWizard()->find_wordforms(s.c_str(), found_paradigms); }; // findinsg by gramcode if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_FIND_BY_GRAMCODE)==IDC_FIND_BY_GRAMCODE) { std::string s = find_what; GetWizard()->find_ancodes(s.c_str(), found_paradigms); }; if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_FIND_BY_USERNAME)==IDC_FIND_BY_USERNAME) { std::string s = find_what; GetWizard()->find_lemm_by_user(s.c_str(), found_paradigms); }; // proceed filter string FilterFoundParadigms(); ShowFoundParadigms(); } catch (CExpc C) { ErrorMessage(C.m_strCause); } }